Class: Wordlist::Lexer
- Inherits:
-
Object
- Object
- Wordlist::Lexer
- Defined in:
- lib/wordlist/lexer.rb,
lib/wordlist/lexer/lang.rb,
lib/wordlist/lexer/stop_words.rb
Overview
Parses arbitrary text and scans each word from it.
Defined Under Namespace
Constant Summary collapse
- ACRONYM =
Regexp to match acronyms.
/[[:alpha:]](?:\.[[:alpha:]])+\./
- SPECIAL_CHARS =
Default set of punctuation characters allowed within words
%w[_ - ']
Instance Attribute Summary collapse
- #ignore_words ⇒ Array<String, Regexp> readonly
- #lang ⇒ Symbol readonly
- #special_chars ⇒ Array<String> readonly
- #stop_words ⇒ Array<String> readonly
Instance Method Summary collapse
-
#acronyms? ⇒ Boolean
Determines whether acronyms will be parsed or ignored.
-
#digits? ⇒ Boolean
Determines whether parsed words may contain digits or not.
-
#initialize(lang: Lang.default, stop_words: StopWords[lang], ignore_words: [], digits: true, special_chars: SPECIAL_CHARS, numbers: false, acronyms: true, normalize_case: false, normalize_apostrophes: false, normalize_acronyms: false) ⇒ Lexer
constructor
Initializes the lexer.
-
#normalize_acronyms? ⇒ Boolean
Determines whether `.` characters will be removed from acronyms.
-
#normalize_apostrophes? ⇒ Boolean
Determines whether apostrophes will be stripped from words.
-
#normalize_case? ⇒ Boolean
Determines whether all words will be converted to lowercase.
-
#numbers? ⇒ Boolean
Determines whether numbers will be parsed or ignored.
-
#parse(text) {|word| ... } ⇒ Array<String>
Enumerates over each word in the text.
Constructor Details
#initialize(lang: Lang.default, stop_words: StopWords[lang], ignore_words: [], digits: true, special_chars: SPECIAL_CHARS, numbers: false, acronyms: true, normalize_case: false, normalize_apostrophes: false, normalize_acronyms: false) ⇒ Lexer
Initializes the lexer.
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
# File 'lib/wordlist/lexer.rb', line 73

#
# Initializes the lexer.
#
# Stores every option and pre-compiles the regular expressions that
# {#parse} uses: `@word` (what counts as a word), `@skip_word` (runs of
# stop/ignored words to jump over) and `@not_a_word` (separator text).
#
# @param [Symbol] lang
#   The language; used here only to pick the default `stop_words:`.
#
# @param [Array<String>] stop_words
#   Words to skip. Strings are matched case-insensitively.
#
# @param [Array<String, Regexp>] ignore_words
#   Additional words (or patterns) to skip.
#
# @param [Boolean] digits
#   Whether words may contain digits after their first letter.
#
# @param [Array<String>] special_chars
#   Punctuation characters allowed inside (but not at either end of) words.
#
# @param [Boolean] numbers
#   Whether whole numbers are lexed as words (`true`) or skipped (`false`).
#
# @param [Boolean] acronyms
#   Whether acronyms are yielded by {#parse} or dropped.
#
# @param [Boolean] normalize_case
#   Whether {#parse} lowercases words.
#
# @param [Boolean] normalize_apostrophes
#   Whether {#parse} strips a trailing `'s` from words.
#
# @param [Boolean] normalize_acronyms
#   Whether {#parse} removes the `.` characters from acronyms.
#
# @raise [ArgumentError]
#   `stop_words:` or `ignore_words:` contained something other than a
#   String or a Regexp. The message names the offending keyword.
#
def initialize(lang:          Lang.default,
               stop_words:    StopWords[lang],
               ignore_words:  [],
               digits:        true,
               special_chars: SPECIAL_CHARS,
               numbers:       false,
               acronyms:      true,
               normalize_case:        false,
               normalize_apostrophes: false,
               normalize_acronyms:    false)
  @lang          = lang
  @stop_words    = stop_words
  @ignore_words  = ignore_words
  @special_chars = special_chars

  @digits   = digits
  @numbers  = numbers
  @acronyms = acronyms

  @normalize_acronyms    = normalize_acronyms
  @normalize_apostrophes = normalize_apostrophes
  @normalize_case        = normalize_case

  # Escaped so the characters can be dropped into a character class below.
  escaped_chars = Regexp.escape(@special_chars.join)

  # A word always starts with a letter and, when longer than one character,
  # must also end with a letter (or digit), so special characters can only
  # appear in the middle.
  @word = if @digits
            # allows numeric characters
            /\p{L}(?:[\p{L}\p{Nd}#{escaped_chars}]*[\p{L}\p{Nd}])?/
          else
            # only allows alpha characters
            /\p{L}(?:[\p{L}#{escaped_chars}]*\p{L})?/
          end

  # Converts one stop/ignore entry into a Regexp. The keyword is passed in
  # so that the error points at the argument that actually held the bad value.
  to_pattern = lambda do |entry, keyword|
    case entry
    when Regexp then entry
    when String then /#{Regexp.escape(entry)}/i
    else
      raise(ArgumentError,"#{keyword}: must contain only Strings or Regexps")
    end
  end

  skip_words = Regexp.union(
    @stop_words.map   { |entry| to_pattern.call(entry, :stop_words) } +
    @ignore_words.map { |entry| to_pattern.call(entry, :ignore_words) }
  )

  if @numbers
    # allows lexing whole numbers
    @skip_word  = /(?:#{skip_words}[[:punct:]]*(?:[[:space:]]+|$))+/i
    @word       = /#{@word}|\d+/
    @not_a_word = /[^\p{L}\d]+/
  else
    # skips whole numbers
    @skip_word  = /(?:(?:#{skip_words}|\d+)[[:punct:]]*(?:[[:space:]]+|$))+/i
    @not_a_word = /[^\p{L}]+/
  end
end
Instance Attribute Details
#ignore_words ⇒ Array<String, Regexp> (readonly)
30 31 32 |
# File 'lib/wordlist/lexer.rb', line 30

#
# The extra words and patterns the lexer was told to skip, exactly as they
# were given to the constructor's `ignore_words:` keyword.
#
# @return [Array<String, Regexp>]
#
def ignore_words
  @ignore_words
end
#lang ⇒ Symbol (readonly)
24 25 26 |
# File 'lib/wordlist/lexer.rb', line 24

#
# The language the lexer was constructed with (the `lang:` keyword).
#
# @return [Symbol]
#
def lang
  @lang
end
#special_chars ⇒ Array<String> (readonly)
33 34 35 |
# File 'lib/wordlist/lexer.rb', line 33

#
# The punctuation characters that are permitted inside words, as given to
# the constructor's `special_chars:` keyword.
#
# @return [Array<String>]
#
def special_chars
  @special_chars
end
#stop_words ⇒ Array<String> (readonly)
27 28 29 |
# File 'lib/wordlist/lexer.rb', line 27

#
# The stop words the lexer skips, as given to the constructor's
# `stop_words:` keyword.
#
# @return [Array<String>]
#
def stop_words
  @stop_words
end
Instance Method Details
#acronyms? ⇒ Boolean
Determines whether acronyms will be parsed or ignored.
152 153 154 |
# File 'lib/wordlist/lexer.rb', line 152

#
# Determines whether acronyms will be parsed or ignored.
#
# @return [Boolean]
#   The value given to the constructor's `acronyms:` keyword.
#
def acronyms?
  @acronyms
end
#digits? ⇒ Boolean
Determines whether parsed words may contain digits or not.
134 135 136 |
# File 'lib/wordlist/lexer.rb', line 134

#
# Determines whether parsed words may contain digits or not.
#
# @return [Boolean]
#   The value given to the constructor's `digits:` keyword.
#
def digits?
  @digits
end
#normalize_acronyms? ⇒ Boolean
Determines whether `.` characters will be removed from acronyms.
161 162 163 |
# File 'lib/wordlist/lexer.rb', line 161

#
# Determines whether `.` characters will be removed from acronyms.
#
# @return [Boolean]
#   The value given to the constructor's `normalize_acronyms:` keyword.
#
def normalize_acronyms?
  @normalize_acronyms
end
#normalize_apostrophes? ⇒ Boolean
Determines whether apostrophes will be stripped from words.
170 171 172 |
# File 'lib/wordlist/lexer.rb', line 170

#
# Determines whether a trailing apostrophe-s (`'s`) will be stripped from
# words.
#
# @return [Boolean]
#   The value given to the constructor's `normalize_apostrophes:` keyword.
#
def normalize_apostrophes?
  @normalize_apostrophes
end
#normalize_case? ⇒ Boolean
Determines whether all words will be converted to lowercase.
179 180 181 |
# File 'lib/wordlist/lexer.rb', line 179

#
# Determines whether all words will be converted to lowercase.
#
# @return [Boolean]
#   The value given to the constructor's `normalize_case:` keyword.
#
def normalize_case?
  @normalize_case
end
#numbers? ⇒ Boolean
Determines whether numbers will be parsed or ignored.
143 144 145 |
# File 'lib/wordlist/lexer.rb', line 143

#
# Determines whether numbers will be parsed or ignored.
#
# @return [Boolean]
#   The value given to the constructor's `numbers:` keyword.
#
def numbers?
  @numbers
end
#parse(text) {|word| ... } ⇒ Array<String>
Enumerates over each word in the text.
196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 |
# File 'lib/wordlist/lexer.rb', line 196

#
# Enumerates over each word in the text.
#
# @param [String] text
#   The text to scan.
#
# @yield [word]
#   The given block is passed each word found in the text.
#
# @yieldparam [String] word
#   A word or acronym, after any enabled normalization.
#
# @return [Array<String>]
#   When no block is given, all of the words collected into an Array.
#   (With a block, the loop's own value — `nil` — is returned.)
#
def parse(text,&block)
  return enum_for(__method__,text).to_a unless block

  scanner = StringScanner.new(text)

  until scanner.eos?
    # Step over separators first, then over any run of stop/ignored words.
    scanner.skip(@not_a_word)
    scanner.skip(@skip_word)

    # Acronyms are tried before plain words so "U.S.A." is not lexed as
    # three one-letter words.
    if (acronym = scanner.scan(ACRONYM))
      # A disabled acronym is still consumed, just not yielded.
      next unless @acronyms

      acronym.tr!('.','') if @normalize_acronyms
      yield acronym
    elsif (word = scanner.scan(@word))
      word.downcase! if @normalize_case
      # Case-insensitive so possessives in upper-case text ("BOB'S") are
      # stripped even when case normalization is off.
      word.sub!(/'s\z/i,'') if @normalize_apostrophes
      yield word
    end
  end
end