Class: Wordlist::Lexer

Inherits:
Object
  • Object
show all
Defined in:
lib/wordlist/lexer.rb,
lib/wordlist/lexer/lang.rb,
lib/wordlist/lexer/stop_words.rb

Overview

Parses arbitrary text and scans each word from it.

Since:

  • 1.0.0

Defined Under Namespace

Modules: Lang, StopWords

Constant Summary collapse

ACRONYM =

Regexp to match acronyms.

Since:

  • 1.0.0

/[[:alpha:]](?:\.[[:alpha:]])+\./
SPECIAL_CHARS =

Default set of punctuation characters allowed within words

Since:

  • 1.0.0

%w[_ - ']

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lang: Lang.default, stop_words: StopWords[lang], ignore_words: [], digits: true, special_chars: SPECIAL_CHARS, numbers: false, acronyms: true, normalize_case: false, normalize_apostrophes: false, normalize_acronyms: false) ⇒ Lexer

Initializes the lexer.

Parameters:

  • lang (Symbol) (defaults to: Lang.default)

    The language to use. Defaults to Wordlist::Lexer::Lang.default.

  • stop_words (Array<String>) (defaults to: StopWords[lang])

    The explicit stop-words to ignore. If not given, default stop words will be loaded based on lang or Wordlist::Lexer::Lang.default.

  • ignore_words (Array<String, Regexp>) (defaults to: [])

    Optional list of words to ignore. Can contain Strings or Regexps.

  • digits (Boolean) (defaults to: true)

    Controls whether parsed words may contain digits or not.

  • special_chars (Array<String>) (defaults to: SPECIAL_CHARS)

    The additional special characters allowed within words.

  • numbers (Boolean) (defaults to: false)

    Controls whether whole numbers will be parsed as words.

  • acronyms (Boolean) (defaults to: true)

    Controls whether acronyms will be parsed as words.

  • normalize_case (Boolean) (defaults to: false)

    Controls whether to convert all words to lowercase.

  • normalize_apostrophes (Boolean) (defaults to: false)

    Controls whether a trailing 's will be removed from the end of words.

  • normalize_acronyms (Boolean) (defaults to: false)

    Controls whether acronyms will have . characters removed.

Raises:

  • (ArgumentError)

    The ignore_words keyword contained a value other than a String or Regexp.

Since:

  • 1.0.0



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/wordlist/lexer.rb', line 73

# Initializes the lexer and pre-compiles the regular expressions that
# the scanner uses.
#
# @param [Symbol] lang
#   The language to use. Defaults to `Lang.default`.
#
# @param [Array<String>] stop_words
#   The explicit stop-words to ignore. Defaults to `StopWords[lang]`.
#
# @param [Array<String, Regexp>] ignore_words
#   Optional list of words to ignore. Can contain Strings or Regexps.
#
# @param [Boolean] digits
#   Controls whether parsed words may contain digits or not.
#
# @param [Array<String>] special_chars
#   The additional special characters allowed within words.
#
# @param [Boolean] numbers
#   Controls whether whole numbers will be parsed as words.
#
# @param [Boolean] acronyms
#   Controls whether acronyms will be parsed as words.
#
# @param [Boolean] normalize_case
#   Controls whether to convert words to lowercase.
#
# @param [Boolean] normalize_apostrophes
#   Controls whether a trailing `'s` will be removed from words.
#
# @param [Boolean] normalize_acronyms
#   Controls whether acronyms will have `.` characters removed.
#
# @raise [ArgumentError]
#   An entry of `stop_words` or `ignore_words` was neither a String nor
#   a Regexp.
def initialize(lang:          Lang.default,
               stop_words:    StopWords[lang],
               ignore_words:  [],
               digits:   true,
               special_chars:  SPECIAL_CHARS,
               numbers:  false,
               acronyms: true,
               normalize_case:        false,
               normalize_apostrophes: false,
               normalize_acronyms:    false)
  # what to lex and what to skip
  @lang          = lang
  @stop_words    = stop_words
  @ignore_words  = ignore_words
  @special_chars = special_chars

  # which kinds of tokens are accepted
  @digits   = digits
  @numbers  = numbers
  @acronyms = acronyms

  # post-processing switches
  @normalize_case        = normalize_case
  @normalize_apostrophes = normalize_apostrophes
  @normalize_acronyms    = normalize_acronyms

  # Escaped so that the characters are taken literally inside the
  # character classes built below.
  inner_chars = Regexp.escape(@special_chars.join)

  # A word starts with a letter; special characters may only appear in
  # its interior, so a longer word must also end with a letter (or a
  # digit, when digits are allowed).
  plain_word =
    if @digits
      /\p{L}(?:[\p{L}\p{Nd}#{inner_chars}]*[\p{L}\p{Nd}])?/
    else
      /\p{L}(?:[\p{L}#{inner_chars}]*\p{L})?/
    end

  # Strings are matched literally and case-insensitively; Regexps are
  # used exactly as given.
  skip_patterns = (@stop_words + @ignore_words).map do |entry|
    case entry
    when String
      /#{Regexp.escape(entry)}/i
    when Regexp
      entry
    else
      raise(ArgumentError,"ignore_words: must contain only Strings or Regexps")
    end
  end

  skip_words = Regexp.union(skip_patterns)

  # A skippable run is one or more skip entries, each optionally followed
  # by punctuation and then by whitespace or the end of a line.
  if @numbers
    # whole numbers are lexed as words
    @word        = /#{plain_word}|\d+/
    @skip_word   = /(?:#{skip_words}[[:punct:]]*(?:[[:space:]]+|$))+/i
    @not_a_word  = /[^\p{L}\d]+/
  else
    # whole numbers are skipped
    @word        = plain_word
    @skip_word   = /(?:(?:#{skip_words}|\d+)[[:punct:]]*(?:[[:space:]]+|$))+/i
    @not_a_word  = /[^\p{L}]+/
  end
end

Instance Attribute Details

#ignore_words ⇒ Array<String, Regexp> (readonly)

Returns:

  • (Array<String, Regexp>)

Since:

  • 1.0.0



30
31
32
# File 'lib/wordlist/lexer.rb', line 30

# The list of words to ignore, as stored by the constructor.
#
# @return [Array<String, Regexp>]
def ignore_words
  @ignore_words
end

#lang ⇒ Symbol (readonly)

Returns:

  • (Symbol)

Since:

  • 1.0.0



24
25
26
# File 'lib/wordlist/lexer.rb', line 24

# The language the lexer was initialized with.
#
# @return [Symbol]
def lang
  @lang
end

#special_chars ⇒ Array<String> (readonly)

Returns:

  • (Array<String>)

Since:

  • 1.0.0



33
34
35
# File 'lib/wordlist/lexer.rb', line 33

# The additional special characters allowed within words.
#
# @return [Array<String>]
def special_chars
  @special_chars
end

#stop_words ⇒ Array<String> (readonly)

Returns:

  • (Array<String>)

Since:

  • 1.0.0



27
28
29
# File 'lib/wordlist/lexer.rb', line 27

# The stop-words that the lexer ignores.
#
# @return [Array<String>]
def stop_words
  @stop_words
end

Instance Method Details

#acronyms? ⇒ Boolean

Determines whether acronyms will be parsed or ignored.

Returns:

  • (Boolean)

Since:

  • 1.0.0



152
153
154
# File 'lib/wordlist/lexer.rb', line 152

# Determines whether acronyms will be parsed or ignored.
#
# @return [Boolean]
def acronyms?
  @acronyms
end

#digits? ⇒ Boolean

Determines whether parsed words may contain digits or not.

Returns:

  • (Boolean)

Since:

  • 1.0.0



134
135
136
# File 'lib/wordlist/lexer.rb', line 134

# Determines whether parsed words may contain digits or not.
#
# @return [Boolean]
def digits?
  @digits
end

#normalize_acronyms? ⇒ Boolean

Determines whether . characters will be removed from acronyms.

Returns:

  • (Boolean)

Since:

  • 1.0.0



161
162
163
# File 'lib/wordlist/lexer.rb', line 161

# Determines whether `.` characters will be removed from acronyms.
#
# @return [Boolean]
def normalize_acronyms?
  @normalize_acronyms
end

#normalize_apostrophes? ⇒ Boolean

Determines whether apostrophes will be stripped from words.

Returns:

  • (Boolean)

Since:

  • 1.0.0



170
171
172
# File 'lib/wordlist/lexer.rb', line 170

# Determines whether apostrophes will be stripped from words.
#
# @return [Boolean]
def normalize_apostrophes?
  @normalize_apostrophes
end

#normalize_case? ⇒ Boolean

Determines whether all words will be converted to lowercase.

Returns:

  • (Boolean)

Since:

  • 1.0.0



179
180
181
# File 'lib/wordlist/lexer.rb', line 179

# Determines whether all words will be converted to lowercase.
#
# @return [Boolean]
def normalize_case?
  @normalize_case
end

#numbers? ⇒ Boolean

Determines whether numbers will be parsed or ignored.

Returns:

  • (Boolean)

Since:

  • 1.0.0



143
144
145
# File 'lib/wordlist/lexer.rb', line 143

# Determines whether numbers will be parsed or ignored.
#
# @return [Boolean]
def numbers?
  @numbers
end

#parse(text) {|word| ... } ⇒ Array<String>

Enumerates over each word in the text.

Yields:

  • (word)

    The given block will be passed each word from the text.

Yield Parameters:

  • word (String)

    A parsed word from the text.

Returns:

  • (Array<String>)

    If no block is given, an Array of the parsed words will be returned instead.

Since:

  • 1.0.0



196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# File 'lib/wordlist/lexer.rb', line 196

# Enumerates over each word in the text.
#
# @param [String] text
#   The text to scan for words.
#
# @yield [word]
#   The given block will be passed each word from the text.
#
# @yieldparam [String] word
#   A parsed word from the text.
#
# @return [Array<String>]
#   If no block is given, an Array of the parsed words will be returned
#   instead.
def parse(text,&block)
  # Without a block, re-enter this method through an Enumerator and
  # collect everything it yields.
  unless block_given?
    return to_enum(__method__,text).to_a
  end

  input = StringScanner.new(text)

  until input.eos?
    # Drop leading non-word characters, then any run of skippable words.
    input.skip(@not_a_word)
    input.skip(@skip_word)

    # Acronyms are tried first so that "U.S.A." is not split at the dots.
    token = input.scan(ACRONYM)

    if token
      # The acronym is consumed even when acronyms are disabled.
      next unless @acronyms

      token.delete!('.') if @normalize_acronyms
    else
      token = input.scan(@word)
      next if token.nil?

      token.downcase! if @normalize_case
      # chomp! only removes the suffix when the word actually ends with it.
      token.chomp!("'s") if @normalize_apostrophes
    end

    yield token
  end
end