Class: Wordlist::Lexer

Inherits:

Object

Object
Wordlist::Lexer

show all

Defined in: lib/wordlist/lexer.rb,
lib/wordlist/lexer/lang.rb,
lib/wordlist/lexer/stop_words.rb

Overview

Parses arbitrary text and scans each word from it.

Since:

1.0.0

Defined Under Namespace

Modules: Lang, StopWords

Constant Summary collapse

ACRONYM = Regexp to match acronyms. Since: 1.0.0

/[[:alpha:]](?:\.[[:alpha:]])+\./

SPECIAL_CHARS = Default set of punctuation characters allowed within words. Since: 1.0.0

%w[_ - ']

Instance Attribute Summary collapse

#ignore_words ⇒ Array<String, Regexp> readonly
#lang ⇒ Symbol readonly
#special_chars ⇒ Array<String> readonly
#stop_words ⇒ Array<String> readonly

Instance Method Summary collapse

#acronyms? ⇒ Boolean
Determines whether acronyms will be parsed or ignored.
#digits? ⇒ Boolean
Determines whether parsed words may contain digits or not.
#initialize(lang: Lang.default, stop_words: StopWords[lang], ignore_words: [], digits: true, special_chars: SPECIAL_CHARS, numbers: false, acronyms: true, normalize_case: false, normalize_apostrophes: false, normalize_acronyms: false) ⇒ Lexer constructor
Initializes the lexer.
#normalize_acronyms? ⇒ Boolean
Determines whether . characters will be removed from acronyms.
#normalize_apostrophes? ⇒ Boolean
Determines whether apostrophes will be stripped from words.
#normalize_case? ⇒ Boolean
Determines whether all words will be converted to lowercase.
#numbers? ⇒ Boolean
Determines whether numbers will be parsed or ignored.
#parse(text) {|word| ... } ⇒ Array<String>
Enumerates over each word in the text.

Constructor Details

#initialize(lang: Lang.default, stop_words: StopWords[lang], ignore_words: [], digits: true, special_chars: SPECIAL_CHARS, numbers: false, acronyms: true, normalize_case: false, normalize_apostrophes: false, normalize_acronyms: false) ⇒ `Lexer`

Initializes the lexer.

Parameters:

lang (Symbol) (defaults to: Lang.default) —
The language to use. Defaults to Wordlist::Lexer::Lang.default.
stop_words (Array<String>) (defaults to: StopWords[lang]) —
The explicit stop-words to ignore. If not given, default stop words will be loaded based on lang or Wordlist::Lexer::Lang.default.
ignore_words (Array<String, Regexp>) (defaults to: []) —
Optional list of words to ignore. Can contain Strings or Regexps.
digits (Boolean) (defaults to: true) —
Controls whether parsed words may contain digits or not.
special_chars (Array<String>) (defaults to: SPECIAL_CHARS) —
The additional special characters allowed within words.
numbers (Boolean) (defaults to: false) —
Controls whether whole numbers will be parsed as words.
acronyms (Boolean) (defaults to: true) —
Controls whether acronyms will be parsed as words.
normalize_case (Boolean) (defaults to: false) —
Controls whether to convert all words to lowercase.
normalize_apostrophes (Boolean) (defaults to: false) —
Controls whether a trailing apostrophe-s ('s) will be removed from the end of words.
normalize_acronyms (Boolean) (defaults to: false) —
Controls whether acronyms will have . characters removed.

Raises:

(ArgumentError) —
The ignore_words keyword contained a value other than a String or Regexp.

Since:

1.0.0

# File 'lib/wordlist/lexer.rb', line 73

#
# Initializes the lexer and pre-compiles the regular expressions that
# {#parse} scans with.
#
# @param [Symbol] lang
#   The language to use.
#
# @param [Array<String>] stop_words
#   Stop-words to skip. Defaults to the stop words for `lang`.
#
# @param [Array<String, Regexp>] ignore_words
#   Additional words to skip. Strings are matched case-insensitively,
#   Regexps are used as given.
#
# @param [Boolean] digits
#   Whether words may contain digits after their first letter.
#
# @param [Array<String>] special_chars
#   Punctuation characters allowed in the middle of a word.
#
# @param [Boolean] numbers
#   Whether runs of digits are scanned as words instead of being skipped.
#
# @param [Boolean] acronyms
#   Whether acronyms are yielded by {#parse}.
#
# @param [Boolean] normalize_case
#   Whether words are converted to lowercase.
#
# @param [Boolean] normalize_apostrophes
#   Whether a trailing `'s` is removed from words.
#
# @param [Boolean] normalize_acronyms
#   Whether `.` characters are removed from acronyms.
#
# @raise [ArgumentError]
#   `stop_words` or `ignore_words` contained a value that is neither a
#   String nor a Regexp.
#
def initialize(lang:          Lang.default,
               stop_words:    StopWords[lang],
               ignore_words:  [],
               digits:   true,
               special_chars:  SPECIAL_CHARS,
               numbers:  false,
               acronyms: true,
               normalize_case:        false,
               normalize_apostrophes: false,
               normalize_acronyms:    false)
  # vocabulary configuration
  @lang          = lang
  @stop_words    = stop_words
  @ignore_words  = ignore_words
  @special_chars = special_chars

  # parsing switches
  @acronyms = acronyms
  @digits   = digits
  @numbers  = numbers

  # normalization switches
  @normalize_case        = normalize_case
  @normalize_apostrophes = normalize_apostrophes
  @normalize_acronyms    = normalize_acronyms

  # escaped so the characters are literal inside a character class
  allowed_punctuation = Regexp.escape(@special_chars.join)

  # a word always starts with a letter and never ends on punctuation
  word_pattern = if @digits
                   /\p{L}(?:[\p{L}\p{Nd}#{allowed_punctuation}]*[\p{L}\p{Nd}])?/
                 else
                   /\p{L}(?:[\p{L}#{allowed_punctuation}]*\p{L})?/
                 end
  @word = word_pattern

  skip_patterns = (@stop_words + @ignore_words).map do |entry|
    case entry
    when String then /#{Regexp.escape(entry)}/i
    when Regexp then entry
    else
      raise(ArgumentError,"ignore_words: must contain only Strings or Regexps")
    end
  end
  skip_words = Regexp.union(skip_patterns)

  if @numbers
    # whole numbers count as words, so digits are not filler
    @skip_word  = /(?:#{skip_words}[[:punct:]]*(?:[[:space:]]+|$))+/i
    @word       = /#{word_pattern}|\d+/
    @not_a_word = /[^\p{L}\d]+/
  else
    # whole numbers are skipped exactly like stop words
    @skip_word  = /(?:(?:#{skip_words}|\d+)[[:punct:]]*(?:[[:space:]]+|$))+/i
    @not_a_word = /[^\p{L}]+/
  end
end

Instance Attribute Details

#ignore_words ⇒ `Array<String, Regexp>` (readonly)

Returns:

(Array<String, Regexp>)

Since:

1.0.0



30
31
32

# File 'lib/wordlist/lexer.rb', line 30

#
# The extra words to skip, as stored from the `ignore_words:` keyword of
# {#initialize}.
#
# @return [Array<String, Regexp>]
#
def ignore_words
  @ignore_words
end

#lang ⇒ `Symbol` (readonly)

Returns:

(Symbol)

Since:

1.0.0



24
25
26

# File 'lib/wordlist/lexer.rb', line 24

#
# The language the lexer was initialized with (the `lang:` keyword of
# {#initialize}).
#
# @return [Symbol]
#
def lang
  @lang
end

#special_chars ⇒ `Array<String>` (readonly)

Returns:

(Array<String>)

Since:

1.0.0



33
34
35

# File 'lib/wordlist/lexer.rb', line 33

#
# The punctuation characters allowed within words, as stored from the
# `special_chars:` keyword of {#initialize}.
#
# @return [Array<String>]
#
def special_chars
  @special_chars
end

#stop_words ⇒ `Array<String>` (readonly)

Returns:

(Array<String>)

Since:

1.0.0



27
28
29

# File 'lib/wordlist/lexer.rb', line 27

#
# The stop-words to skip, as stored from the `stop_words:` keyword of
# {#initialize}.
#
# @return [Array<String>]
#
def stop_words
  @stop_words
end

Instance Method Details

#acronyms? ⇒ `Boolean`

Determines whether acronyms will be parsed or ignored.

Returns:

(Boolean)

Since:

1.0.0



152
153
154

# File 'lib/wordlist/lexer.rb', line 152

#
# Determines whether acronyms will be parsed or ignored.
#
# Returns the value given as the `acronyms:` keyword of {#initialize}.
# When it is falsy, {#parse} still scans past an acronym but does not
# yield it.
#
# @return [Boolean]
#
def acronyms?
  @acronyms
end

#digits? ⇒ `Boolean`

Determines whether parsed words may contain digits or not.

Returns:

(Boolean)

Since:

1.0.0



134
135
136

# File 'lib/wordlist/lexer.rb', line 134

#
# Determines whether parsed words may contain digits or not.
#
# Returns the value given as the `digits:` keyword of {#initialize},
# which selects between the digit-permitting and the letters-only word
# regexp.
#
# @return [Boolean]
#
def digits?
  @digits
end

#normalize_acronyms? ⇒ `Boolean`

Determines whether . characters will be removed from acronyms.

Returns:

(Boolean)

Since:

1.0.0



161
162
163

# File 'lib/wordlist/lexer.rb', line 161

#
# Determines whether `.` characters will be removed from acronyms.
#
# Returns the value given as the `normalize_acronyms:` keyword of
# {#initialize}.
#
# @return [Boolean]
#
def normalize_acronyms?
  @normalize_acronyms
end

#normalize_apostrophes? ⇒ `Boolean`

Determines whether apostrophes will be stripped from words.

Returns:

(Boolean)

Since:

1.0.0



170
171
172

# File 'lib/wordlist/lexer.rb', line 170

#
# Determines whether a trailing `'s` will be stripped from words.
#
# Returns the value given as the `normalize_apostrophes:` keyword of
# {#initialize}.
#
# @return [Boolean]
#
def normalize_apostrophes?
  @normalize_apostrophes
end

#normalize_case? ⇒ `Boolean`

Determines whether all words will be converted to lowercase.

Returns:

(Boolean)

Since:

1.0.0



179
180
181

# File 'lib/wordlist/lexer.rb', line 179

#
# Determines whether words will be converted to lowercase.
#
# Returns the value given as the `normalize_case:` keyword of
# {#initialize}. {#parse} applies the downcasing to regular words only;
# acronyms are yielded with their original case.
#
# @return [Boolean]
#
def normalize_case?
  @normalize_case
end

#numbers? ⇒ `Boolean`

Determines whether numbers will be parsed or ignored.

Returns:

(Boolean)

Since:

1.0.0



143
144
145

# File 'lib/wordlist/lexer.rb', line 143

#
# Determines whether numbers will be parsed or ignored.
#
# Returns the value given as the `numbers:` keyword of {#initialize}.
# When it is truthy the word regexp also matches runs of digits;
# otherwise runs of digits are skipped.
#
# @return [Boolean]
#
def numbers?
  @numbers
end

#parse(text) {|word| ... } ⇒ `Array<String>`

Enumerates over each word in the text.

Yields:

(word) —
The given block will be passed each word from the text.

Yield Parameters:

word (String) —
A parsed word from the text.

Returns:

(Array<String>) —
If no block is given, an Array of the parsed words will be returned instead.

Since:

1.0.0

# File 'lib/wordlist/lexer.rb', line 196

#
# Enumerates over each word in the text.
#
# @param [String] text
#   The text to scan.
#
# @yield [word]
#   The given block will be passed each word from the text.
#
# @yieldparam [String] word
#   A parsed word (or acronym) from the text.
#
# @return [Array<String>, nil]
#   If no block is given, an Array of the parsed words is returned.
#   With a block the method returns `nil`.
#
def parse(text,&block)
  # without a block, re-enter this method through an Enumerator and collect
  return to_enum(__method__,text).to_a unless block

  scanner = StringScanner.new(text)

  until scanner.eos?
    # drop leading filler first, then any run of stop/ignored words
    scanner.skip(@not_a_word)
    scanner.skip(@skip_word)

    # acronyms are tried before plain words so "U.S.A." stays one token
    token = scanner.scan(ACRONYM)

    if token
      # the acronym is consumed even when acronyms are disabled
      next unless @acronyms

      token.delete!('.') if @normalize_acronyms
    else
      token = scanner.scan(@word)
      next unless token

      token.downcase! if @normalize_case
      # chomp! is a no-op when the word does not end in "'s"
      token.chomp!("'s") if @normalize_apostrophes
    end

    yield token
  end
end

Class: Wordlist::Lexer

Overview

Defined Under Namespace

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lang: Lang.default, stop_words: StopWords[lang], ignore_words: [], digits: true, special_chars: SPECIAL_CHARS, numbers: false, acronyms: true, normalize_case: false, normalize_apostrophes: false, normalize_acronyms: false) ⇒ Lexer

Instance Attribute Details

#ignore_words ⇒ Array<String, Regexp> (readonly)

#lang ⇒ Symbol (readonly)

#special_chars ⇒ Array<String> (readonly)

#stop_words ⇒ Array<String> (readonly)

Instance Method Details

#acronyms? ⇒ Boolean

#digits? ⇒ Boolean

#normalize_acronyms? ⇒ Boolean

#normalize_apostrophes? ⇒ Boolean

#normalize_case? ⇒ Boolean

#numbers? ⇒ Boolean

#parse(text) {|word| ... } ⇒ Array<String>

#initialize(lang: Lang.default, stop_words: StopWords[lang], ignore_words: [], digits: true, special_chars: SPECIAL_CHARS, numbers: false, acronyms: true, normalize_case: false, normalize_apostrophes: false, normalize_acronyms: false) ⇒ `Lexer`

#ignore_words ⇒ `Array<String, Regexp>` (readonly)

#lang ⇒ `Symbol` (readonly)

#special_chars ⇒ `Array<String>` (readonly)

#stop_words ⇒ `Array<String>` (readonly)

#acronyms? ⇒ `Boolean`

#digits? ⇒ `Boolean`

#normalize_acronyms? ⇒ `Boolean`

#normalize_apostrophes? ⇒ `Boolean`

#normalize_case? ⇒ `Boolean`

#numbers? ⇒ `Boolean`

#parse(text) {|word| ... } ⇒ `Array<String>`