Module: Linguist::Samples

Defined in:
lib/linguist/samples.rb

Overview

Model for accessing classifier training data.

Constant Summary collapse

ROOT =

Path to samples root directory

File.expand_path("../../../samples", __FILE__)
PATH =

Path for serialized samples db

File.expand_path('../samples.json', __FILE__)

Class Method Summary collapse

Class Method Details

.cache ⇒ Object

Hash of serialized samples object, cached in memory



21
22
23
# File 'lib/linguist/samples.rb', line 21

# Hash of serialized samples object, cached in memory.
#
# The first call delegates to load_samples and stores the result in @cache;
# later calls return that same stored object.
#
# Returns whatever load_samples returned on the first call.
def self.cache
  return @cache if @cache

  @cache = load_samples
end

.data ⇒ Object

Public: Build Classifier from all samples.

Returns the trained classifier database as a Hash.



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/linguist/samples.rb', line 81

# Public: Build the classifier training database from all samples.
#
# Iterates over every sample via each, records the extnames, interpreters and
# filenames seen per language (each list is kept sorted), and passes the
# contents of every sample file to Classifier.train!. After the last sample,
# Classifier.finalize_train! is called on the database and the result of
# Linguist::SHA256.hexdigest(db) is stored under the 'sha256' key.
#
# Returns the database Hash.
def self.data
  db = {
    'extnames'     => {},
    'interpreters' => {},
    'filenames'    => {}
  }

  each do |sample|
    language = sample[:language]

    # Extnames and interpreters are recorded once per language.
    [['extnames', :extname], ['interpreters', :interpreter]].each do |section, key|
      value = sample[key]
      next unless value

      known = (db[section][language] ||= [])
      next if known.include?(value)

      known << value
      known.sort!
    end

    # Filenames are appended without a duplicate check.
    if (filename = sample[:filename])
      names = (db['filenames'][language] ||= [])
      names << filename
      names.sort!
    end

    Classifier.train!(db, language, File.read(sample[:path]))
  end

  Classifier.finalize_train!(db)
  db['sha256'] = Linguist::SHA256.hexdigest(db)
  db
end

.each(&block) ⇒ Object

Public: Iterate over each sample.

&block - Yields each sample, as a Hash, to the block

Returns nothing.



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/linguist/samples.rb', line 43

# Public: Iterate over each sample.
#
# Walks ROOT/<category>/<file>. A directory entry named "filenames" is
# treated as a folder of samples identified by their whole filename; every
# other entry is read as a sample file identified by its extension and by
# the interpreter that Shebang.interpreter extracts from its contents.
#
# Entries are visited in sorted order at every directory level, so the
# sequence of yielded samples does not depend on the order in which the
# filesystem happens to list them.
#
# &block - Yields a Hash per sample. Always contains :path and :language
#          (the category directory name). Samples under "filenames" also
#          carry :filename; all others carry :interpreter and :extname
#          (nil when the file has no extension).
#
# Returns nothing.
def self.each(&block)
  # Dir.entries makes no ordering promise and includes "." and "..".
  listing = lambda do |dir|
    Dir.entries(dir).reject { |entry| entry == '.' || entry == '..' }.sort
  end

  listing.call(ROOT).each do |category|
    dirname = File.join(ROOT, category)

    listing.call(dirname).each do |filename|
      path = File.join(dirname, filename)

      if filename == 'filenames'
        listing.call(path).each do |subfilename|
          yield({
            :path     => File.join(path, subfilename),
            :language => category,
            :filename => subfilename
          })
        end
      else
        extname = File.extname(filename)

        yield({
          :path        => path,
          :language    => category,
          :interpreter => Shebang.interpreter(File.read(path)),
          :extname     => extname.empty? ? nil : extname
        })
      end
    end
  end

  nil
end

.load_samples ⇒ Object

Hash of serialized samples object, uncached



26
27
28
29
30
31
32
33
34
35
36
# File 'lib/linguist/samples.rb', line 26

# Hash of serialized samples object, uncached.
#
# Reads the file at PATH as UTF-8 and parses it with Yajl when that constant
# is defined, otherwise with JSON. JSON object keys are always strings, so
# the keys of every per-language hash under 'centroids' are converted back
# to integers with to_i.
#
# Returns the parsed Hash with its 'centroids' entries re-keyed in place.
def self.load_samples
  parser = defined?(Yajl) ? Yajl : JSON
  db = parser.load(File.read(PATH, encoding: 'utf-8'))

  centroids = db['centroids']
  # Iterate over a snapshot of the keys because the hash is updated in place.
  centroids.keys.each do |language|
    centroids[language] = centroids[language].each_with_object({}) do |(index, value), rekeyed|
      rekeyed[index.to_i] = value
    end
  end

  db
end