near-duplicate detection
(using token frequency weighting to dictate ngram contribution to similarity)

1. sql to dump pois (pois.tsv)
   sql> select p.id as poi_id, pl.name as poi_name, places.id as place_id
   sql> from pois p join places on p.place_id=places.id
   sql> join poi_localisations pl on pl.poi_id=p.id
   (or hit the public api)

2. sql to dump places (places.tsv)
   sql> select id,name,full_name from places
   (or hit the public api)

2.5 clean up last run
   $ rm *out

3. split into 4 files
   $ ./split_based_on_places.rb < pois.tsv

4. generate tokens hash
   $ echo | ./generate_tokens_freq_lookup.rb # for dup ver v1; i.e. all ngrams worth 1
   $ cut -f2 pois.tsv | ./generate_tokens_freq_lookup.rb # for dup ver v2; ngrams weighted

5. process
   cat pois.p0.out | ./near_dups_v2.rb > near_dups_v2.p0.out &
   cat pois.p1.out | ./near_dups_v2.rb > near_dups_v2.p1.out &
   cat pois.p2.out | ./near_dups_v2.rb > near_dups_v2.p2.out &
   cat pois.p3.out | ./near_dups_v2.rb > near_dups_v2.p3.out &
   wait
   sort -n near_dups_v2.p*.out > near_dups_v2.out

   resemblance poi_id_1 poi_id_2
   0.621121695897491 365107 365533
   0.794606978416781 365343 365487
   0.643269950967987 364899 364911

6. run connected components analysis
   $ cat near_dups_v2.out | ./connected_components.rb > connected_components.out

   group_idx poi_id
   0 364343
   0 364265
   1 364817
   1 364823

7. build false positive exclusion list
   $ ./extract_false_positives.rb < false_positives_from_cam.csv > false_positives.tsv

8. make report
   $ ./reportify.rb