(ns knn.core)
(use '[knn.distance :as distance])
(use '[clojure.java.io :as io])
(use '[clojure.string :as string])
;; Struct map for a single data point: the parsing helpers in this file store
;; the first parsed value under :label and the remaining values under :observation.
(defstruct observation :label :observation)

;; Count the occurrences of a value in a vector

(defn- counter
  "Returns how many elements of `v` are numerically equal (`==`) to `value`."
  [v value]
  (->> v
       (filter #(== value %))
       count))

;; Majority-vote labeling scheme: pick the most frequent label among the observations

(defn- majority-label
  "Returns the :label that occurs most often among `observations`.

  The distinct labels are sorted by their occurrence count (via `counter`)
  and the last one, i.e. the one with the highest count, is returned.
  Returns nil when `observations` is empty."
  [observations]
  (let [labels (map :label observations)
        votes  (partial counter labels)]
    (->> (set labels)
         (sort-by votes)
         last)))

;; Return the k nearest neighbors of an observation according to the distance function

(defn- nearest-neighbors
  "Returns the `k` elements of `data` closest to `observation`.

  Closeness is `distance-function` applied to the :observation value of
  `observation` and the :observation value of each candidate in `data`;
  candidates are returned in ascending order of that distance."
  [observation data distance-function k]
  (let [query             (:observation observation)
        distance-to-query (fn [candidate]
                            (distance-function query (:observation candidate)))]
    (->> data
         (sort-by distance-to-query)
         (take k))))

;; Pairwise distance matrix for a collection of observations

(defn- pairwise-distance-matrix
  "Returns the pairwise distance matrix of `vectors` as a vector of row vectors.

  Entry [i][j] is `(distance-function (nth vectors i) (nth vectors j))`.
  An empty `vectors` collection yields an empty matrix `[]`."
  [vectors distance-function]
  ;; The previous implementation returned one partially applied function per
  ;; row instead of the distances themselves; evaluate every pair eagerly.
  (mapv (fn [row-vector]
          (mapv (fn [col-vector] (distance-function row-vector col-vector))
                vectors))
        vectors))

;; Predict the label of each test example from the training data

(defn predict
  "Predicts a label for every element of `test-data`.

  For each test example, its `k` nearest neighbors in `training` (according
  to `distance-function`) are found and their majority label is taken.
  Returns a vector of predicted labels in the order of `test-data`."
  [training test-data distance-function k]
  (mapv (fn [example]
          (-> example
              (nearest-neighbors training distance-function k)
              majority-label))
        test-data))

;; Return the file contents as a vector in which every line is one element

(defn read-lines
  "Reads the file at `file-path` and returns its lines as a vector of strings.

  The lines are fully realized before the reader is closed."
  [file-path]
  (with-open [reader (io/reader file-path)]
    (->> reader
         line-seq
         (into []))))

;; Parse a line into two parts: the label and the observation values

(defn parse-line
  "Parses a whitespace-separated line of numbers into an `observation` struct.

  Every token is parsed with `Float/parseFloat`; the first value becomes the
  :label and the remaining values become the :observation.
  Throws NumberFormatException when a token is not a valid float (this
  includes a blank line)."
  [line]
  ;; Trim first and split on runs of whitespace: splitting on a single literal
  ;; space produced empty tokens for leading or repeated spaces (and never
  ;; split on tabs), which made Float/parseFloat throw on otherwise valid data.
  (let [tokens (.split (.trim ^String line) "\\s+")
        values (map #(Float/parseFloat %) tokens)]
    (struct observation (first values) (rest values))))

;; Parse a vector into a label (first element) and an observation (remaining elements)

(defn parse-vector
  "Builds an `observation` struct from `v`: the first element becomes the
  :label and the rest of the elements become the :observation."
  [v]
  (let [label    (first v)
        features (rest v)]
    (struct observation label features)))

;; Read a CSV file into a vector of rows

(defn read-csv
  "Reads the delimited text file at `file-path` into a vector of rows.

  Each line is split with `String.split` using `delimiter` (which that
  method interprets as a regular expression), and each row is returned as a
  vector of strings. All rows are realized before the reader is closed."
  [file-path delimiter]
  (with-open [rd (io/reader (io/file file-path))]
    (mapv (fn [^String line]
            (vec (.split line delimiter)))
          (line-seq rd))))

;; Convert Iris label strings into their numeric equivalents

(defn- convert-iris-labels
  "Maps an Iris species name to its numeric class label:
  \"Iris-setosa\" -> 0.0, \"Iris-versicolor\" -> 1.0, \"Iris-virginica\" -> 2.0.
  Returns nil for any other value."
  [label]
  (get {"Iris-setosa"     0.0
        "Iris-versicolor" 1.0
        "Iris-virginica"  2.0}
       label))

;; Convert the Iris dataset into a sequence of labeled observations

(defn- get-iris-dataset
  "Loads the comma-separated Iris file at `iris-file-path` and returns a lazy
  sequence of `observation` structs, one per row.

  The last column of each row is converted to a numeric label with
  `convert-iris-labels`; every other column is converted with `bigdec` and
  becomes part of the observation."
  [iris-file-path]
  (for [row (read-csv iris-file-path ",")]
    (let [label    (convert-iris-labels (last row))
          features (map bigdec (butlast row))]
      ;; Label first, features after it, as expected by parse-vector.
      (parse-vector (into [label] features)))))

;; Main function

(defn -main
  "Entry point: runs k-nearest-neighbors on the bundled datasets.

  Reads data/train.txt and data/test.txt, prints the predicted labels for the
  test set, then loads data/iris.csv and classifies the Iris observations
  against themselves. Returns the vector of Iris predictions.
  `args` is accepted but not used."
  [& args]
  ;; Local bindings instead of `def`: a `def` inside a function body creates
  ;; (and on every call overwrites) namespace-global vars.
  (let [train-file-path "data/train.txt"
        test-file-path  "data/test.txt"
        iris-file-path  "data/iris.csv"
        ;; Number of nearest neighbors
        k               5
        training        (mapv parse-line (read-lines train-file-path))
        test-data       (mapv parse-line (read-lines test-file-path))]
    ;; Basic dataset predictions
    (println (predict training test-data distance/euclidean-distance k))
    ;; Prediction on Iris dataset
    (let [iris-data (get-iris-dataset iris-file-path)]
      (predict iris-data iris-data distance/euclidean-distance k))))