(ns knn.core) | |
(use '[knn.distance :as distance]) (use '[clojure.java.io :as io]) (use '[clojure.string :as string]) | |
;; Struct map describing one sample, with the keys :label and :observation.
;; parse-line and parse-vector build it from a sequence whose first element
;; is stored under :label and whose remaining elements go under :observation.
(defstruct observation :label :observation) | |
(defn- counter
  "Count how many elements of `v` are numerically equal (`==`) to `value`.

  Because `==` is used, every element of `v` and `value` itself must be
  numbers."
  [v value]
  (->> v
       (filter #(== value %))
       count))
(defn- majority-label
  "Pick the label that occurs most often among `observations`.

  Each observation is looked up with :label. The distinct labels are sorted
  by their number of occurrences and the last (most frequent) one is
  returned. Ties are resolved by the iteration order of the label set.
  Returns nil when `observations` is empty."
  [observations]
  (let [labels      (map :label observations)
        occurrences (partial counter labels)]
    (->> (set labels)
         (sort-by occurrences)
         last)))
(defn- nearest-neighbors
  "Return the `k` elements of `data` closest to `observation`.

  Closeness is `(distance-function a b)` applied to the :observation value of
  the query and of each candidate. Candidates are sorted by ascending
  distance and the first `k` are taken; fewer are returned when `data` has
  fewer than `k` elements."
  [observation data distance-function k]
  (let [query             (:observation observation)
        distance-to-query (fn [candidate]
                            (distance-function query (:observation candidate)))]
    (->> data
         (sort-by distance-to-query)
         (take k))))
(defn- pairwise-distance-matrix
  "Pairwise distance matrix for observations.

  Returns a vector of row vectors in which the entry at row i, column j is
  `(distance-function (nth vectors i) (nth vectors j))`. An empty `vectors`
  yields [].

  The previous implementation returned a vector of partially applied
  distance functions rather than the distances themselves."
  [vectors distance-function]
  (mapv (fn [row-item]
          (mapv (fn [col-item] (distance-function row-item col-item))
                vectors))
        vectors))
(defn predict
  "Predict a label for every element of `test-data`.

  For each test observation the `k` nearest neighbours are looked up in
  `training` using `distance-function`, and the majority label among those
  neighbours is the prediction. Returns a vector of predicted labels in the
  same order as `test-data`."
  [training test-data distance-function k]
  (let [classify (fn [sample]
                   (majority-label
                    (nearest-neighbors sample training distance-function k)))]
    (mapv classify test-data)))
(defn read-lines
  "Return the contents of the file at `file-path` as a vector with one
  element per line.

  The lines are copied into the vector before the reader is closed, so the
  result can be used after this function returns."
  [file-path]
  (with-open [rdr (io/reader file-path)]
    (into [] (line-seq rdr))))
(defn parse-line
  "Parse one line of whitespace-separated numbers into an `observation`.

  The first number is stored under :label and the remaining numbers under
  :observation. Leading and trailing whitespace is ignored and any run of
  whitespace (spaces or tabs) separates two fields; previously only a single
  space was accepted as separator.

  All tokens are parsed eagerly with Float/parseFloat, so a token that is not
  a valid float (which includes a blank line) raises NumberFormatException
  from this call."
  [line]
  (let [tokens  (clojure.string/split (clojure.string/trim line) #"\s+")
        numbers (mapv #(Float/parseFloat %) tokens)]
    (struct observation (first numbers) (rest numbers))))
(defn parse-vector
  "Split `v` into a label and an observation.

  Returns an `observation` struct whose :label is the first element of `v`
  and whose :observation is the rest of `v`."
  [v]
  (struct-map observation
              :label (first v)
              :observation (rest v)))
(defn read-csv
  "Read the file at `file-path` and split every line on `delimiter`.

  Returns a vector with one vector of strings per line. `delimiter` is handed
  to java.lang.String.split, so it is interpreted as a regular expression.
  All lines are read before the reader is closed."
  [file-path delimiter]
  (with-open [rd (io/reader (io/file file-path))]
    (let [split-fields (fn [^String line] (vec (.split line delimiter)))]
      (mapv split-fields (line-seq rd)))))
(defn- convert-iris-labels
  "Convert an Iris class name into its numeric equivalent.

  \"Iris-setosa\" -> 0.0, \"Iris-versicolor\" -> 1.0, \"Iris-virginica\" -> 2.0.
  Any other value yields nil."
  [label]
  (case label
    "Iris-setosa"     0.0
    "Iris-versicolor" 1.0
    "Iris-virginica"  2.0
    nil))
(defn- get-iris-dataset
  "Load the Iris CSV at `iris-file-path` as a sequence of observations.

  Every row is split on \",\". The last field is the class name and is
  converted with convert-iris-labels; all preceding fields are converted with
  bigdec. Label and features are then joined into one vector and handed to
  parse-vector. The file is read when this function is called; the
  per-row conversion is lazy."
  [iris-file-path]
  (let [rows           (read-csv iris-file-path ",")
        ->observation  (fn [row]
                         (let [label    (convert-iris-labels (last row))
                               features (into [] (map bigdec (butlast row)))]
                           (parse-vector (into [] (cons label features)))))]
    (map ->observation rows)))
(defn -main
  "Main function.

  Reads the training and test sets from data/train.txt and data/test.txt,
  prints the k-NN predictions for the test set, then runs k-NN on the Iris
  data set (data/iris.csv) against itself.

  All intermediate values are local `let` bindings. The previous version used
  `def` inside the function body, which creates or rebinds namespace-global
  vars every time -main runs.

  The Iris predictions are not printed (as before); they are the return
  value of this function."
  [& args]
  (let [train-file-path "data/train.txt"
        test-file-path  "data/test.txt"
        ;; Number of nearest neighbors
        k               5
        training        (mapv parse-line (read-lines train-file-path))
        test-data       (mapv parse-line (read-lines test-file-path))]
    ;; Basic dataset predictions
    (println (predict training test-data distance/euclidean-distance k))
    ;; Prediction on Iris dataset
    (let [iris-file-path "data/iris.csv"
          iris-data      (get-iris-dataset iris-file-path)]
      (predict iris-data iris-data distance/euclidean-distance k))))