-- K-Means In SQL -- INITIALIZING DATA: Arbitarily (reads: randomly or sequentially, depending on the implementation) selecting v_k nodes from km_data, labeling them in sequential order, also labeling them as "iteration #0", and save to km_clusters. INSERT INTO km_clusters(iter, id, lat, lon) SELECT 0, auto_incr(), lat, lon, 0 FROM km_data LIMIT v_k; -- CHOOSING NEAREST CENTROID FOR EVERY NODE: INSERT INTO km_data(iter, id, cluster_id, lat, lon) SELECT iter+1 AS iter, -- incrementing iteration ID d.id, c.cluster_id, c.lat, c.lon FROM km_clusters c, km_data d WHERE c.iter=i AND d.iter=i -- i is the pseudo-variable for iteration ID. AND c.id=d.id -- ??????? ORDER BY POW(d.lat-c.lat, 2)+POW(d.lon-c.lon, 2) ASC LIMIT 1; -- UPDATE CENTROIDS: INSERT INTO km_clusters(iter, id, lat, lon) SELECT i+1 AS iter, d.cluster_id, AVG(lat) AS lat, AVG(lon) as lon FROM km_clusters c, km_data d WHERE d.iter=i AND c.iter=i AND c.cluster_id=d.id GROUP BY cluster_id; -- A more incremental version -- -- Notice how we are not recording iterations here since we are not keeping history values. -- INITIALIZE CENTROIDS: INSERT INTO km_clusters(lat, lng, iter) SELECT lat, lng, 0 FROM km_data LIMIT v_k; REPEAT -- ASSIGN CLUSTERS TO DATAPOINTS: -- ASSIGN NEAREST CENTROID TO EACH NODE: UPDATE km_data d SET cluster_id = ( SELECT id FROM km_clusters c ORDER BY POW(d.lat-c.lat, 2)+POW(d.lon-c.lon, 2) ASC LIMIT 1); -- CALCULATE NEW CENTROID: UPDATE km_clusters C, ( SELECT cluster_id, AVG(lat) AS lat, AVG(lon) as lon FROM km_data GROUP BY cluster_id) D SET C.lat=D.lat, C.lng=D.lng WHERE C.id=D.id; UNTIL ROW_COUNT()=0 END REPEAT;