Nixinova / LinguistJS

Analyse and list all languages used in a folder. Implementation of and powered by GitHub's Linguist.

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Use Bayes classifier to fix classification of ambiguous files

Nixinova opened this issue · comments

Currently around 200 of github/linguist's samples are misclassified:

{
  'AL/ALIssue.al': 'Perl',
  'AL/ALIssueList.al': 'Perl',
  'AL/RefreshALIssuesCode.al': 'Perl',
  'AMPL/CT2.mod': 'Linux Kernel Module',
  'AngelScript/botmanager.as': 'ActionScript',
  'AngelScript/payload.as': 'ActionScript',
  'Apex/ArrayUtils.cls': 'ObjectScript',
  'Apex/BooleanUtils.cls': 'ObjectScript',
  'Apex/EmailUtils.cls': 'ObjectScript',
  'Apex/GeoUtils.cls': 'ObjectScript',
  'Apex/LanguageUtils.cls': 'ObjectScript',
  'Apex/TwilioAPI.cls': 'ObjectScript',
  'Assembly/audio.i': 'SWIG',
  'Assembly/FASM.asm': 'Motorola 68K Assembly',
  'Assembly/fp_sqr32_160_comba.inc': 'Pascal',
  'Assembly/lib.inc': 'Pascal',
  'Assembly/macros.inc': 'Pascal',
  'Blade/hello.blade.php': 'PHP',
  'Brainfuck/fib100.bf': 'Beef',
  'Brainfuck/hello.bf': 'Beef',
  'Brainfuck/helloworld.bf': 'Beef',
  'Brainfuck/rot13.bf': 'Beef',
  'C++/bar.hh': 'Hack',
  'C++/ClasspathVMSystemProperties.inc': 'Pascal',
  'C++/initClasses.inc': 'Pascal',
  'C++/instances.inc': 'Pascal',
  'Charity/example.ch': 'xBase',
  'CMake/filenames/CMakeLists.txt': 'Text',
  'CoffeeScript/build.cake': 'C#',
  'Component Pascal/Example.cp': 'C++',
  'Cue Sheet/sample1.cue': 'CUE',
  'Cue Sheet/sample2.cue': 'CUE',
  'Eiffel/application.e': 'E',
  'Eiffel/book_collection.e': 'E',
  'Eiffel/git_checkout_command.e': 'E',
  'Formatted/long_seq.for': 'Fortran',
  'Formatted/NiAlH_jea.eam.fs': 'Filterscript',
  'Formatted/wksst8110.for': 'Fortran',
  'Forth/core1.F': 'Filebench WML',
  'Fortran/sample3.F': 'Filebench WML',
  'GAP/PackageInfo.g': 'G-code',
  'HLSL/corridor.fx': 'FLUX',
  'HLSL/jellyfish.fx': 'FLUX',
  'HLSL/noise.fx': 'FLUX',
  'HTML/rpanel.inc': 'Pascal',
  'HTML/tailDel.inc': 'Pascal',
  'HyPhy/AAModelComparison.bf': 'Beef',
  'HyPhy/CodonModelCompare.bf': 'Beef',
  'HyPhy/dNdSDistributionComparison.bf': 'Beef',
  'HyPhy/hyphy_cmds.bf': 'Beef',
  'HyPhy/MatrixIndexing.bf': 'Beef',
  'HyPhy/MFPositiveSelection.bf': 'Beef',
  'HyPhy/MolecularClock.bf': 'Beef',
  'HyPhy/profile_test.bf': 'Beef',
  'INI/ms.cfg': 'HAProxy',
  'Java Properties/libraries.properties': 'INI',
  'JavaScript/chart_composers.gs': 'Genie',
  'JavaScript/intro.js.frag': 'GLSL',
  'JavaScript/itau.gs': 'Genie',
  'JavaScript/merge.js': 'AppleScript',
  'JavaScript/outro.js.frag': 'GLSL',
  'JavaScript+ERB/create.js.erb': 'HTML+ERB',
  'jq/builtin.jq': 'JSONiq',
  'jq/sample.jq': 'JSONiq',
  'KiCad Legacy Layout/tc14badge.brd': 'Eagle',
  'KiCad Schematic/buttons.sch': 'Eagle',
  'KiCad Schematic/buzzer.sch': 'Eagle',
  'KiCad Schematic/ciaaConector.sch': 'Eagle',
  'KiCad Schematic/gedda-junk.sch': 'Eagle',
  'KiCad Schematic/ultimate-temp-controller.sch': 'Eagle',
  'KiCad Schematic/Volume.sch': 'Eagle',
  'Lex/filenames/Lexer.x': 'Linker Script',
  'Limbo/cat.b': 'Brainfuck',
  'Limbo/lock.b': 'Brainfuck',
  'Literate CoffeeScript/pixi.coffee.md': 'Markdown',
  'Logos/string1.x': 'Linker Script',
  'M/arrays.m': 'Limbo',
  'M/fibonacci.m': 'Limbo',
  'M/forloop.m': 'Limbo',
  'M/helloworld.m': 'Limbo',
  'M/ifelse.m': 'Limbo',
  'M/indirectfunctions.m': 'Limbo',
  'M/mileage.m': 'Limbo',
  'M/nesting.m': 'Limbo',
  'Makefile/filenames/Makefile.inc': 'Pascal',
  'Mathematica/HeyexImport.m': 'Limbo',
  'Mathematica/Init.m': 'Limbo',
  'Mathematica/PacletInfo.m': 'Limbo',
  'Mathematica/Predicates.m': 'Limbo',
  'Mathematica/Problem12.m': 'Limbo',
  'MATLAB/Check_plot.m': 'Limbo',
  'MATLAB/matlab_class.m': 'Limbo',
  'MATLAB/normalize.m': 'Limbo',
  'Meson/filenames/meson_options.txt': 'Text',
  'Moocode/hello.moo': 'Mercury',
  'Moocode/moocode_toolkit.moo': 'Mercury',
  'Moocode/toy.moo': 'Mercury',
  'MQL5/Regex.mqh': 'MQL4',
  'NCL/cru_8.ncl': 'Text',
  'NCL/gsn_csm_xy2_time_series_inputs.ncl': 'Text',
  'NCL/hdf4sds_7.ncl': 'Text',
  'NCL/mask_12.ncl': 'Text',
  'NCL/mcsst_1.ncl': 'Text',
  'NCL/primero.ncl': 'Text',
  'NCL/PrnOscPat_driver.ncl': 'Text',
  'NCL/topo_9.ncl': 'Text',
  'NCL/traj_3.ncl': 'Text',
  'NCL/tsdiagram_1.ncl': 'Text',
  'NCL/unique_9.ncl': 'Text',
  'NCL/viewport_4.ncl': 'Text',
  'NCL/weather_sym_6.ncl': 'Text',
  'NCL/WRF_static_2.ncl': 'Text',
  'NCL/WRF_track_1.ncl': 'Text',
  'NCL/xy_29.ncl': 'Text',
  'NetLinx+ERB/sample.axi.erb': 'HTML+ERB',
  'NetLinx+ERB/sample.axs.erb': 'HTML+ERB',
  'Nginx/example.com.vhost': 'ApacheConf',
  'Nim/nimfix.nim.cfg': 'HAProxy',
  'Objective-C/cocoa_monitor.m': 'Limbo',
  'Objective-J/AppController.j': 'Jasmin',
  'Objective-J/iTunesLayout.j': 'Jasmin',
  'Objective-J/LightsOff.j': 'Jasmin',
  'OCaml/common.ml': 'Standard ML',
  'OCaml/date.ml': 'Standard ML',
  'OCaml/sigset.ml': 'Standard ML',
  'OpenEdge ABL/Email.cls': 'ObjectScript',
  'OpenEdge ABL/SendEmailAlgorithm.cls': 'ObjectScript',
  'OpenEdge ABL/Util.cls': 'ObjectScript',
  'Parrot Internal Representation/hello.pir': 'Parrot Assembly',
  'Pawn/Check.inc': 'Pascal',
  'Pawn/fixed.inc': 'Pascal',
  'Pawn/fixes.inc': 'Pascal',
  'Pawn/mfile.inc': 'Pascal',
  'Pawn/y_testing.inc': 'Pascal',
  'Perl/exception_handler.pl': 'Raku',
  'Perl/script.pl': 'Raku',
  'Perl/test-perl.pl': 'Raku',
  'Perl/test-perl2.pl': 'Raku',
  'PHP/prefix.fcgi': 'Lua',
  'QMake/complex.pro': 'IDL',
  'QMake/simple.pro': 'IDL',
  'Raku/htmlify.pl': 'Pod 6',
  'Raku/man-or-boy.t': 'Turing',
  'Raku/test.p6': 'Pod 6',
  'Record Jar/filenames/language-subtag-registry.txt': 'Text',
  "C:/Users/user/Documents/GitHub/`forks/fork_linguist/samples/Ren'Py/example.rpy": "Ren'Py",
  'reStructuredText/HACKING.rst.txt': 'Text',
  'robots.txt/filenames/robots.txt': 'Text',
  'Roff Manpage/gather_profile_stats.man': 'Roff',
  'Roff Manpage/lyxclient.1in': 'Roff',
  'Roff Manpage/pgrep.3p': 'Roff',
  'Roff Manpage/sched_get_priority_min.3x': 'Roff',
  'Roff Manpage/sensor_attach.mdoc': 'Roff',
  'Roff Manpage/sigwait.3qt': 'Roff',
  'Roff Manpage/tan.3m': 'Roff',
  'Roff Manpage/tls_config_ocsp_require_stapling.3in': 'Roff',
  'Roff Manpage/uname.1m': 'Roff',
  'Roff Manpage/URI.3pm': 'Roff',
  'Roff Manpage/zforce.1x': 'Roff',
  'Roff Manpage/zip_file_add.mdoc': 'Roff',
  'RPM Spec/apache.spec': 'Python',
  'RPM Spec/erlang-erlydtl.spec': 'Python',
  'RPM Spec/manos.spec': 'Python',
  'Ruby/any.spec': 'Python',
  'Scala/99-bottles-of-beer': 'Shell',
  'Scala/scala': 'Shell',
  'Scheme/lambdastar.sls': 'SaltStack',
  'Scheme/sboyer.sch': 'Eagle',
  'ShaderLab/DepthOfField.shader': 'GLSL',
  'ShaderLab/Fog.shader': 'GLSL',
  'ShaderLab/Uber.shader': 'GLSL',
  'SQL/AvailableInSearchSel.prc': 'PLSQL',
  'SQL/hostcache_set_state.inc': 'Pascal',
  'SQL/videodb.ddl': 'PLSQL',
  'SRecode Template/linguist.srt': 'SubRip Text',
  'Standard ML/Foo.ML': 'OCaml',
  'SuperCollider/example.scd': 'Markdown',
  'Terra/arith.t': 'Turing',
  'Terra/arrayt.t': 'Turing',
  'Terra/benchmark_nbody.t': 'Turing',
  'Text/aptitude-defaults.nb': 'Mathematica',
  'Text/tutor.nb': 'Mathematica',
  'TI Program/srcalpha.8xp.txt': 'Text',
  'TI Program/srcfunc.8xp.txt': 'Text',
  'TI Program/srcgui.8xp.txt': 'Text',
  'TI Program/srcsort.8xp.txt': 'Text',
  'Unity3D Asset/handFingers.mask': 'Mask',
  'Unix Assembly/gemm_kernel_1x4.S': 'Motorola 68K Assembly',
  'Unix Assembly/hello.ms': 'MAXScript',
  'Unix Assembly/hello.s': 'Motorola 68K Assembly',
  'Unix Assembly/support.S': 'Motorola 68K Assembly',
  'VBA/cApplication.cls': 'ObjectScript',
  'VBA/dictionary.cls': 'ObjectScript',
  'VBA/specs.bas': 'BASIC',
  'XML/battlescribe.gst': 'Gosu',
  'XML/Case.workflow': 'HCL',
  'XML/HITSP_C32.sch': 'Eagle',
  'XML/holobloc-sample.res': 'ReScript',
  'XML/namespace-strict.sch': 'Eagle',
  'XML/oasis-table.sch': 'Eagle',
  'XML/some-ideas.mm': 'Objective-C++',
  'XML/XmlIO.pluginspec': 'Ruby',
  'YAML/database.yml.mysql': 'SQL',
}

Many of these occur because github/linguist runs its own Bayesian classifier on these ambiguous files, while linguist-js currently doesn't.

There are a couple npm packages like classificator that may be of use here.

Found a roadblock: unless I were to fetch every sample github-linguist offers, a classifier trained only on samples of a single given language has nothing to compare the files against, so it can never give a proper rating. I don't think this would be possible without it taking forever.

Actually, I think it should be possible without using too many web requests if I just load one sample per language. So, per heuristic, the number of web requests is the number of associated languages + 1. Maybe this should be off by default.

Well, loading one sample per language makes the results no better than normal, and it'd take too many requests for this to be worthwhile.