Class: AnyStyle::Feature::Words

Inherits:
AnyStyle::Feature show all
Defined in:
lib/anystyle/feature/words.rb

Constant Summary collapse

# Words that #observe counts as "title words" when computing the
# title-word ratio of a token. Entries are lower-case and are compared
# against canonized words with Array#include?.
#
# The list is frozen so that it cannot be modified by accident at runtime.
TITLE_WORDS =
%w{
  abstract
  acknowledgements
  appendix
  bibliography
  bibliographie
  chapter
  cited
  contents
  figures
  introduction
  kurzfassung
  literatur
  literature
  references
  referenzen
  secondary
  section
  sources
  summary
  tables
  works
}.freeze

Instance Attribute Summary collapse

Attributes inherited from AnyStyle::Feature

#precision

Instance Method Summary collapse

Methods inherited from AnyStyle::Feature

#next, #prev, #ratio

Methods included from StringUtils

canonize, count, display_chars, display_width, indent, nnum, page_break?, scrub, strip_html, transliterate

Constructor Details

#initialize(dictionary:, **opts) ⇒ Words

Returns a new instance of Words.


30
31
32
33
# File 'lib/anystyle/feature/words.rb', line 30

# Creates the feature and keeps a reference to the dictionary it consults.
#
# @param dictionary [Object] stored as-is in @dictionary; #observe calls
#   +tag_counts+ on it
# @param opts [Hash] all remaining keyword options, forwarded unchanged
#   to the superclass initializer
def initialize(dictionary:, **opts)
  super(**opts)
  @dictionary = dictionary
end

Instance Attribute Details

#dictionary ⇒ Object (readonly)

Returns the value of attribute dictionary


4
5
6
# File 'lib/anystyle/feature/words.rb', line 4

# Read-only accessor for the dictionary.
#
# @return [Object] the value of @dictionary, which #initialize sets from
#   its +dictionary:+ keyword argument
def dictionary
  @dictionary
end

Instance Method Details

#classify(word) ⇒ Object


64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/anystyle/feature/words.rb', line 64

# Assigns a coarse category to a single word.
#
# The rules are tried in order and the first match wins:
#
# * +:number+  - the word consists only of digits, is one of the roman
#   numeral shapes accepted by the first pattern (e.g. "ii", "iv",
#   "viii", "x"; case-insensitive), or is the empty string (the pattern
#   has an empty alternative)
# * +:numeric+ - the word contains at least one digit
# * +:none+    - the word is nil
# * +:alpha+   - anything else
#
# @param word [String, nil] the word to classify
# @return [Symbol] one of :number, :numeric, :none or :alpha
def classify(word)
  rules = [
    [/^(\d+|[vx]?iii?|i?[vx]|)$/i, :number],
    [/\d/, :numeric],
    [nil, :none]
  ]

  # `===` is the same comparison a case/when performs, so nil and
  # non-String arguments simply fail the regexp rules instead of raising.
  hit = rules.find { |pattern, _label| pattern === word }
  hit ? hit.last : :alpha
end

#observe(token, **opts) ⇒ Object


35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/anystyle/feature/words.rb', line 35

# Builds the word-based feature vector for a token.
#
# The token is split on whitespace, every piece is passed through
# +canonize+, and pieces that come back empty are dropped. The returned
# array contains, in this order:
#
# 1. the number of remaining words
# 2. their average length (integer division; 0 when there are no words)
# 3. their median length (integer division; 0 when there are no words)
# 4. the number of matches of /\S\s\s+\S/ in the raw token, i.e. runs of
#    two or more whitespace characters between non-whitespace characters
# 5. the #classify category of the first word
# 6. the number of matches of /\d+(\.\d+)?/ in the raw token
# 7. +ratio+ of TITLE_WORDS hits to the word count
# 8. one +ratio+ per entry returned by +dictionary.tag_counts+
#
# @param token [String] the raw text to observe
# @param opts [Hash] accepted for interface compatibility; not used here
# @return [Array] the feature values described above
def observe(token, **opts)
  words = token.scan(/\S+/).map { |raw| canonize(raw) }.reject(&:empty?)
  gaps = token.scan(/\S\s\s+\S/)
  figures = token.scan(/\d+(\.\d+)?/)
  title_hits = words.count { |word| TITLE_WORDS.include?(word) }
  tag_counts = dictionary.tag_counts(words)
  total = words.length

  if total.zero?
    mean = 0
    median = 0
  else
    sizes = words.map(&:length).sort
    middle = sizes.length / 2

    mean = sizes.sum / sizes.length
    median =
      if sizes.length.odd?
        sizes[middle]
      else
        # Even count: average the two central lengths.
        (sizes[middle - 1] + sizes[middle]) / 2
      end
  end

  [
    total,
    mean,
    median,
    gaps.length,
    classify(words.first),
    figures.length,
    ratio(title_hits, total),
    *tag_counts.map { |count| ratio(count, total) }
  ]
end