# Project layout.
#   TASK         base name of the program.
#   SRCDIRS      space-separated list of directories scanned for sources.
#   SRCEXT       source file extension (without the dot).
#   SOURCES      every *.$(SRCEXT) file found directly inside each SRCDIRS entry.
#   OBJECTS      one .o per source, placed next to the source.
#   DEPENDENCIES one .d per object (written by the compiler via -MMD).
#   EXECUTABLE   output binary; `?=` lets the environment/command line rename it.
#   DIRNAME      name of the directory make was started in (used by `archive`).
TASK := kmeans
SRCDIRS := .
SRCEXT := cpp
# addsuffix applies the glob to every directory, so SRCDIRS may list several.
SOURCES := $(wildcard $(addsuffix /*.$(SRCEXT),$(SRCDIRS)))
# Derive object names from SRCEXT rather than a hard-coded extension so that
# changing SRCEXT keeps SOURCES and OBJECTS in sync.
OBJECTS := $(SOURCES:%.$(SRCEXT)=%.o)
DEPENDENCIES := $(OBJECTS:%.o=%.d)
EXECUTABLE ?= ${TASK}.exe
DIRNAME := $(notdir ${CURDIR})

# Sources of the classic build: every source except main_soa.$(SRCEXT).
# NOTE(review): main_soa presumably holds the entry point of a separate SOA
# variant; this file defines no rule that builds it — confirm.
CLASSIC_SOURCES := $(filter-out %main_soa.$(SRCEXT),$(SOURCES))
CLASSIC_OBJECTS := $(CLASSIC_SOURCES:%.$(SRCEXT)=%.o)

# Directory holding the *.in data sets and the gen_input.py generator script.
INPUT_DIR := input

# Compiler selection.
# CXX is one of make's predefined variables, so `CXX ?= clang++` would never
# take effect (the built-in value already counts as "defined"). Replace only
# the built-in default; a CXX from the environment or the command line wins.
ifeq ($(origin CXX),default)
CXX := clang++
endif
# Recursive on purpose: COMPILER follows whatever CXX finally resolves to.
COMPILER = ${CXX}

# Pick the OpenMP switch for the selected compiler driver.
# Detection is keyed on CC first (the presets below pass CC=... for this
# purpose); CXX is consulted as a fallback because it is the compiler that is
# actually invoked. Anything unrecognised gets the generic -fopenmp.
# The assignments are indented with spaces, not tabs: a tab-indented line
# would be parsed as recipe text if a rule were ever placed above this block.
ifeq (${CC}, icx)
  FLAGS_OPENMP = -fiopenmp
else ifeq (${CC}, scorep-icx)
  FLAGS_OPENMP = -fiopenmp
else ifeq (${CC}, icpx)
  FLAGS_OPENMP = -qopenmp
else ifeq (${CXX}, icpx)
  FLAGS_OPENMP = -qopenmp
else
  FLAGS_OPENMP = -fopenmp
endif

# Flags passed to every compile and link command. Kept recursive so the
# target-specific `+=` on debug/release and a late FLAGS_OPENMP both apply.
FLAGS = -Wall -std=c++14 -g -fno-omit-frame-pointer $(FLAGS_OPENMP)
# Appended to FLAGS by the `release` target.
FLAGS_FAST = -O3
# Appended to FLAGS by the `debug` target.
FLAGS_DEBUG = -O0 -Wall -Wextra
# One -I option per entry of SRCDIRS.
INCLUDES = $(SRCDIRS:%=-I%)
# Libraries appended to the link command; empty by default.
LDLIBS =

# Value exported as OMP_NUM_THREADS by the run-* targets.
NTHREADS ?= 1
# Group identifier embedded in the name of the tarball built by `archive`.
GROUP ?= X

# Default goal (first rule in the file): plain `make` gives the release build.
build: release

# Optimised build: FLAGS gains $(FLAGS_FAST) for this target and everything
# it depends on.
release: FLAGS += $(FLAGS_FAST)
release: $(EXECUTABLE)

# Debug build: FLAGS gains $(FLAGS_DEBUG) instead.
# Both modes share the same object files and executable name, so run
# `make clean` when switching between them.
debug: FLAGS += $(FLAGS_DEBUG)
debug: $(EXECUTABLE)

# Link step of the classic build: all CLASSIC_OBJECTS ($^) are linked into
# the executable ($@), with LDLIBS placed after the objects.
$(EXECUTABLE): $(CLASSIC_OBJECTS)
	$(COMPILER) $(FLAGS) -o $@ $^ $(LDLIBS)

# Compile one source file ($<) into its object ($@).
# -MMD -MP makes the compiler write a matching .d file listing the headers
# used; those files are pulled in by the -include at the end of this file.
%.o: %.$(SRCEXT)
	$(COMPILER) $(INCLUDES) -MMD -MP $(FLAGS) -c -o $@ $<

# Convenience alias: make sure the large data set exists.
input-large: ${INPUT_DIR}/large.in

# Generate the large data set with gen_input.py; it is regenerated whenever
# the script is newer than the data file.
# NCENTERS, NPOINTS and DIM are inherited from `run-large` (target-specific
# variables propagate to prerequisites). When building this file directly,
# pass them on the command line, e.g. `make input-large NCENTERS=... `.
# $< is the generator script and $@ the output file, so the recipe follows
# INPUT_DIR instead of repeating a literal path.
${INPUT_DIR}/large.in: ${INPUT_DIR}/gen_input.py
	module load Python && python $< --file $@ --k ${NCENTERS} --n ${NPOINTS} --dim_x ${DIM} --dim_y ${DIM}

# Build the release binary and run it on small.in.
# DIM and NPOINTS are fixed for this data set; NCENTERS and NITERS are only
# defaults (`?=`). Positional arguments: input file, DIM, NPOINTS, NCENTERS,
# NITERS.
run-small: NITERS ?= 20
run-small: NCENTERS ?= 5
run-small: DIM = 100
run-small: NPOINTS = 1000
run-small: release
	OMP_NUM_THREADS=$(NTHREADS) ./$(EXECUTABLE) $(INPUT_DIR)/small.in $(DIM) $(NPOINTS) $(NCENTERS) $(NITERS)

# Build the release binary and run it on mid.in.
# DIM and NPOINTS are fixed for this data set; NCENTERS and NITERS are only
# defaults (`?=`). Positional arguments: input file, DIM, NPOINTS, NCENTERS,
# NITERS.
run-mid: NITERS ?= 50
run-mid: NCENTERS ?= 5000
run-mid: DIM = 100
run-mid: NPOINTS = 1000000
run-mid: release
	OMP_NUM_THREADS=$(NTHREADS) ./$(EXECUTABLE) $(INPUT_DIR)/mid.in $(DIM) $(NPOINTS) $(NCENTERS) $(NITERS)

# Build the release binary, make sure large.in exists (input-large), then run.
# The variables below are also inherited by the large.in generation rule.
# Positional arguments: input file, DIM, NPOINTS, NCENTERS, NITERS.
run-large: NITERS ?= 100
run-large: NCENTERS ?= 50000
run-large: DIM = 1000
run-large: NPOINTS = 100000000
run-large: release input-large
	OMP_NUM_THREADS=$(NTHREADS) ./$(EXECUTABLE) $(INPUT_DIR)/large.in $(DIM) $(NPOINTS) $(NCENTERS) $(NITERS)

# Visualisation helpers: call utils/make-gif.py with the chosen input file,
# memory.out and clusters.gif as arguments.
# NOTE(review): presumably memory.out is written by a previous run and
# clusters.gif is the script's output — confirm against make-gif.py.
vis-small:
	module load Python && ./utils/make-gif.py $(INPUT_DIR)/small.in memory.out clusters.gif

vis-mid:
	module load Python && ./utils/make-gif.py $(INPUT_DIR)/mid.in memory.out clusters.gif

vis-large:
	module load Python && ./utils/make-gif.py $(INPUT_DIR)/large.in memory.out clusters.gif

# Pack the regular files of this directory (top level only, no
# subdirectories) into <dirname>-group-<GROUP>.tar.gz after cleaning build
# products. --transform prefixes every member with <dirname>-group-<GROUP>/ so
# the tarball unpacks into its own folder.
# The output tarball is excluded from the file list so that an archive left
# over from an earlier run is never handed to tar as one of its own inputs.
archive: clean
	find . -maxdepth 1 -type f ! -name '${DIRNAME}-group-${GROUP}.tar.gz' -exec tar --transform 's|^|${DIRNAME}-group-${GROUP}/|g' -cvzf ${DIRNAME}-group-${GROUP}.tar.gz {} +

# GCC presets: rebuild from scratch with g++ at a given optimisation level,
# producing a separately named executable.
# clean and release run as two sequential sub-makes; passing both goals to a
# single $(MAKE) would let them run concurrently under `make -j`, deleting
# objects while they are being built.
gcc-O2:
	$(MAKE) clean EXECUTABLE=kmeans-gcc-O2
	$(MAKE) release CXX=g++ FLAGS_FAST="-O2 -march=native -mtune=native" EXECUTABLE=kmeans-gcc-O2

gcc-O3:
	$(MAKE) clean EXECUTABLE=kmeans-gcc-O3
	$(MAKE) release CXX=g++ FLAGS_FAST="-O3 -march=native -mtune=native" EXECUTABLE=kmeans-gcc-O3

gcc-Ofast:
	$(MAKE) clean EXECUTABLE=kmeans-gcc-Ofast
	$(MAKE) release CXX=g++ FLAGS_FAST="-Ofast -march=native -mtune=native" EXECUTABLE=kmeans-gcc-Ofast

# Clang presets: rebuild from scratch with clang++ at a given optimisation
# level, producing a separately named executable. FLAGS_OPENMP is forced to
# -fopenmp regardless of what CC-based detection would choose.
# clean and release run as two sequential sub-makes; passing both goals to a
# single $(MAKE) would let them run concurrently under `make -j`, deleting
# objects while they are being built.
clang-O2:
	$(MAKE) clean EXECUTABLE=kmeans-clang-O2
	$(MAKE) release CXX=clang++ FLAGS_FAST="-O2 -march=native -mtune=native" FLAGS_OPENMP="-fopenmp" EXECUTABLE=kmeans-clang-O2

clang-O3:
	$(MAKE) clean EXECUTABLE=kmeans-clang-O3
	$(MAKE) release CXX=clang++ FLAGS_FAST="-O3 -march=native -mtune=native" FLAGS_OPENMP="-fopenmp" EXECUTABLE=kmeans-clang-O3

clang-O3-ffast-math:
	$(MAKE) clean EXECUTABLE=kmeans-clang-O3-ffast-math
	$(MAKE) release CXX=clang++ FLAGS_FAST="-O3 -ffast-math -march=native -mtune=native" FLAGS_OPENMP="-fopenmp" EXECUTABLE=kmeans-clang-O3-ffast-math

# Intel icpx presets: rebuild from scratch with icpx at a given optimisation
# level, producing a separately named executable. CC=icpx is passed as well
# because the OpenMP flag selection in this file is keyed on CC.
# clean and release run as two sequential sub-makes; passing both goals to a
# single $(MAKE) would let them run concurrently under `make -j`, deleting
# objects while they are being built.
icpx-O2:
	$(MAKE) clean EXECUTABLE=kmeans-icpx-O2
	$(MAKE) release CXX=icpx CC=icpx FLAGS_FAST="-O2 -xHost -mtune=native" EXECUTABLE=kmeans-icpx-O2

icpx-O3:
	$(MAKE) clean EXECUTABLE=kmeans-icpx-O3
	$(MAKE) release CXX=icpx CC=icpx FLAGS_FAST="-O3 -xHost -mtune=native" EXECUTABLE=kmeans-icpx-O3

icpx-Ofast:
	$(MAKE) clean EXECUTABLE=kmeans-icpx-Ofast
	$(MAKE) release CXX=icpx CC=icpx FLAGS_FAST="-Ofast -xHost -mtune=native" EXECUTABLE=kmeans-icpx-Ofast

# Arm Compiler for Linux presets: rebuild from scratch with armclang++ at a
# given optimisation level, producing a separately named executable.
# clean and release run as two sequential sub-makes; passing both goals to a
# single $(MAKE) would let them run concurrently under `make -j`, deleting
# objects while they are being built.
acfl-O2:
	$(MAKE) clean EXECUTABLE=kmeans-acfl-O2
	$(MAKE) release CXX=armclang++ CC=armclang++ FLAGS_FAST="-O2 -grecord-gcc-switches -march=native -mcpu=native" EXECUTABLE=kmeans-acfl-O2

acfl-O3:
	$(MAKE) clean EXECUTABLE=kmeans-acfl-O3
	$(MAKE) release CXX=armclang++ CC=armclang++ FLAGS_FAST="-O3 -grecord-gcc-switches -march=native -mcpu=native" EXECUTABLE=kmeans-acfl-O3

acfl-Ofast:
	$(MAKE) clean EXECUTABLE=kmeans-acfl-Ofast
	$(MAKE) release CXX=armclang++ CC=armclang++ FLAGS_FAST="-Ofast -grecord-gcc-switches -march=native -mcpu=native" EXECUTABLE=kmeans-acfl-Ofast

# Targets that name actions rather than files. The list mirrors the rules
# defined in this file, so a stray file called e.g. `gcc-O2` or `input-large`
# cannot mask a target, and a misspelled preset name fails instead of being
# silently accepted.
.PHONY: clean build debug release \
        input-large run-small run-mid run-large \
        vis-small vis-mid vis-large archive \
        gcc-O2 gcc-O3 gcc-Ofast \
        clang-O2 clang-O3 clang-O3-ffast-math \
        icpx-O2 icpx-O3 icpx-Ofast \
        acfl-O2 acfl-O3 acfl-Ofast

# Remove build products: the executable currently named by EXECUTABLE, every
# object and dependency file derived from SOURCES, and any *.gif in this
# directory. $(RM) is make's `rm -f`, so missing files are not an error.
clean:
	$(RM) $(EXECUTABLE)
	$(RM) $(OBJECTS)
	$(RM) $(DEPENDENCIES)
	$(RM) *.gif

# Pull in the .d files written by the compile rule (-MMD -MP) so that header
# changes trigger recompilation; the leading '-' ignores files that do not
# exist yet (e.g. on the first build or after `make clean`).
-include ${DEPENDENCIES}
