Class: Ve::Parse::FreelingEn
- Defined in:
- lib/providers/freeling_en.rb
Constant Summary collapse
# Lookup table from the part-of-speech tag of a parsed token to a two-element
# array: [Ve::PartOfSpeech class, grammar symbol or nil].
# A tag that is not listed here yields nil on lookup.
# Frozen so the shared table cannot be modified by accident.
# NOTE(review): FreeLing's tag set documents 'Z' as a numeral tag, yet it is
# mapped to Determiner here -- confirm this is intended before changing it.
INTERNAL_INFO_FOR_PARSED_POS = {
  'CC' => [Ve::PartOfSpeech::Conjunction, nil],
  'CD' => [Ve::PartOfSpeech::Number, nil],
  'DT' => [Ve::PartOfSpeech::Determiner, nil],
  'EX' => [Ve::PartOfSpeech::Pronoun, nil],
  'FW' => [Ve::PartOfSpeech::Unknown, nil],
  'IN' => [Ve::PartOfSpeech::Preposition, nil],
  'JJ' => [Ve::PartOfSpeech::Adjective, nil],
  'JJR' => [Ve::PartOfSpeech::Adjective, :comparative],
  'JJS' => [Ve::PartOfSpeech::Adjective, :superlative],
  'LS' => [Ve::PartOfSpeech::Unknown, nil],
  'MD' => [Ve::PartOfSpeech::Verb, :modal],
  'NN' => [Ve::PartOfSpeech::Noun, nil],
  'NNS' => [Ve::PartOfSpeech::Noun, :plural],
  'NNP' => [Ve::PartOfSpeech::ProperNoun, nil],
  'NNPS' => [Ve::PartOfSpeech::ProperNoun, :plural],
  'PDT' => [Ve::PartOfSpeech::Determiner, nil],
  'PRP' => [Ve::PartOfSpeech::Pronoun, :personal],
  'PRP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
  'RB' => [Ve::PartOfSpeech::Adverb, nil],
  'RBR' => [Ve::PartOfSpeech::Adverb, :comparative],
  'RBS' => [Ve::PartOfSpeech::Adverb, :superlative],
  'RP' => [Ve::PartOfSpeech::Postposition, nil],
  'SYM' => [Ve::PartOfSpeech::Symbol, nil],
  'TO' => [Ve::PartOfSpeech::Preposition, nil],
  'UH' => [Ve::PartOfSpeech::Interjection, nil],
  'VB' => [Ve::PartOfSpeech::Verb, nil],
  'VBD' => [Ve::PartOfSpeech::Verb, :past],
  'VBG' => [Ve::PartOfSpeech::Verb, :present_participle],
  'VBN' => [Ve::PartOfSpeech::Verb, :past_participle],
  'VBP' => [Ve::PartOfSpeech::Verb, nil],
  'VBZ' => [Ve::PartOfSpeech::Verb, nil],
  'WDT' => [Ve::PartOfSpeech::Determiner, nil],
  'WP' => [Ve::PartOfSpeech::Pronoun, nil],
  'WP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
  'WRB' => [Ve::PartOfSpeech::Adverb, nil],
  'Z' => [Ve::PartOfSpeech::Determiner, nil]
}.freeze
Instance Attribute Summary collapse
-
#text ⇒ Object
readonly
Returns the value of attribute text.
-
#tokens ⇒ Object
readonly
Returns the value of attribute tokens.
Instance Method Summary collapse
-
#initialize(text, output) ⇒ FreelingEn
constructor
A new instance of FreelingEn.
- #sentences ⇒ Object
- #words ⇒ Object
Methods inherited from Ve::Parse
Constructor Details
#initialize(text, output) ⇒ FreelingEn
Returns a new instance of FreelingEn.
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# File 'lib/providers/freeling_en.rb', line 78
#
# Builds the token list for +text+ from Freeling's line-based output.
#
# Every non-empty line of +output+ becomes a :parsed token whose
# whitespace-separated columns are read as literal, lemma, pos and accuracy
# (underscores in literal and lemma are turned into spaces). Empty lines become
# :sentence_split tokens. Stretches of +text+ found in front of a parsed token,
# or left over when the last output line is reached, become :unparsed tokens.
# :parsed and :unparsed tokens carry a :characters range of offsets into +text+.
#
# @param text [String] the text the output was produced from
# @param output [Array<String>] Freeling output, one line per element; the
#   elements themselves are left untouched
def initialize(text, output)
  @tokens = []
  @text = text
  position = 0

  output.each_with_index do |line, index|
    # Work on a stripped copy so the caller's (possibly frozen) strings are not mutated.
    line = line.rstrip
    token = {:raw => line}

    # Anything unparsed at the end of the text
    # This must happen before sentence splits are detected to avoid funny ordering
    if output.length > 1 && output.length == index + 1
      # match returns nil when position has run past the end of text
      unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
      if unparsed_md && unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed, :literal => unparsed_md[1], :raw => ''}
        unparsed_token[:characters] = (position..(position + unparsed_token[:literal].length - 1))
        @tokens << unparsed_token
      end
    end

    # Sentence splits are just empty lines in Freeling
    if line.empty?
      token[:type] = :sentence_split
      token[:literal] = ''
      @tokens << token
      next
    end

    # The parsed token
    info = line.split(/\s+/)
    token[:type] = :parsed
    [:literal, :lemma, :pos, :accuracy].each_with_index do |attr, i|
      token[attr] = info[i]
    end

    token[:literal] = token[:literal].tr('_', ' ')
    # A line holding nothing but the literal has no lemma column.
    token[:lemma] = token[:lemma].tr('_', ' ') if token[:lemma]

    # Anything unparsed preceding this token.
    # Freeling replaces spaces with _, so we cannot know whether the text has a
    # whitespace character or an underscore at each of those places. Every space
    # of the literal is therefore allowed to match either one. (Escaping the
    # literal as a whole would turn its spaces into "\ ", which only matches a
    # real space.) Either way exactly one character is consumed, so the literal
    # length still gives the right offsets.
    literal_pattern = token[:literal].split(/ /, -1).map { |part| Regexp.quote(part) }.join('[\s_]')
    unparsed_md = %r{(.*?)#{literal_pattern}}m.match(text, position)
    if unparsed_md && unparsed_md[1].length > 0
      unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
      unparsed_token[:characters] = (position..(position + unparsed_token[:literal].length - 1))
      @tokens << unparsed_token
      position += unparsed_token[:literal].length
    end

    token[:characters] = (position..(position + token[:literal].length - 1))
    position += token[:literal].length
    @tokens << token
  end
end
Instance Attribute Details
#text ⇒ Object (readonly)
Returns the value of attribute text.
76 77 78 |
# File 'lib/providers/freeling_en.rb', line 76
#
# Reader for the @text instance variable.
#
# @return [Object] the current value of @text
def text
  @text
end
#tokens ⇒ Object (readonly)
Returns the value of attribute tokens.
76 77 78 |
# File 'lib/providers/freeling_en.rb', line 76
#
# Reader for the @tokens instance variable.
#
# @return [Object] the current value of @tokens
def tokens
  @tokens
end
Instance Method Details
#sentences ⇒ Object
204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 |
# File 'lib/providers/freeling_en.rb', line 204
#
# Joins the token literals back together, one string per sentence.
#
# A :sentence_split token closes the sentence collected so far (even when it
# is empty); the literals of all other tokens are appended in order. Each
# sentence has its leading and trailing whitespace stripped.
#
# @return [Array<String>] the sentences in token order
def sentences
  sentences = []
  current = ''

  @tokens.each do |token|
    if token[:type] == :sentence_split
      sentences << current
      current = ''
    else
      current << token[:literal]
    end
  end

  # In case there is no :sentence_split at the end
  sentences << current unless current.empty?

  # strip! works in place; each hands back the array itself
  sentences.each(&:strip!)
end
#words ⇒ Object
178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
# File 'lib/providers/freeling_en.rb', line 178
#
# Turns the :parsed tokens into Ve::Word objects.
#
# A token tagged 'POS' (possessive ending) is appended to the word before it
# instead of becoming a word of its own. For every other token the tag is
# looked up in INTERNAL_INFO_FOR_PARSED_POS; unlisted tags starting with "F"
# become Ve::PartOfSpeech::Symbol (presumably Freeling's punctuation tags --
# confirm), and anything still unresolved becomes Ve::PartOfSpeech::TBD.
#
# @return [Array<Ve::Word>] the words in token order
def words
  words = []

  @tokens.each do |token|
    next unless token[:type] == :parsed

    # Possessive ending, add to previous token. When there is no previous
    # word to attach to, fall through and treat it like any other token
    # instead of calling methods on nil.
    if token[:pos] == 'POS' && !words.empty?
      previous = words.last
      previous.word << token[:literal]
      previous.tokens << token
      next
    end

    # All other tokens
    pos, grammar = INTERNAL_INFO_FOR_PARSED_POS[token[:pos]]
    pos = Ve::PartOfSpeech::Symbol if pos.nil? && token[:pos] =~ /\AF\w+\z/
    pos = Ve::PartOfSpeech::TBD if pos.nil?

    words << Ve::Word.new(token[:literal], token[:lemma], pos, [token], {:grammar => grammar})
  end

  words
end