Class: Ve::Parse::FreelingEn

Inherits:
Ve::Parse show all
Defined in:
lib/providers/freeling_en.rb

Constant Summary collapse

# Lookup table from the part-of-speech tag found in a parsed token to a pair
# [Ve::PartOfSpeech class, grammar symbol or nil].
# Tags that are missing from this table are resolved by the caller (#words).
# NOTE(review): the tag names look like the Penn Treebank tag set - confirm.
# Frozen so the shared table cannot be modified by accident at runtime.
INTERNAL_INFO_FOR_PARSED_POS =
{
  'CC' => [Ve::PartOfSpeech::Conjunction, nil],
  'CD' => [Ve::PartOfSpeech::Number, nil],
  'DT' => [Ve::PartOfSpeech::Determiner, nil],
  'EX' => [Ve::PartOfSpeech::Pronoun, nil],
  'FW' => [Ve::PartOfSpeech::Unknown, nil],
  'IN' => [Ve::PartOfSpeech::Preposition, nil],
  'JJ' => [Ve::PartOfSpeech::Adjective, nil],
  'JJR' => [Ve::PartOfSpeech::Adjective, :comparative],
  'JJS' => [Ve::PartOfSpeech::Adjective, :superlative],
  'LS' => [Ve::PartOfSpeech::Unknown, nil],
  'MD' => [Ve::PartOfSpeech::Verb, :modal],
  'NN' => [Ve::PartOfSpeech::Noun, nil],
  'NNS' => [Ve::PartOfSpeech::Noun, :plural],
  'NNP' => [Ve::PartOfSpeech::ProperNoun, nil],
  'NNPS' => [Ve::PartOfSpeech::ProperNoun, :plural],
  'PDT' => [Ve::PartOfSpeech::Determiner, nil],
  'PRP' => [Ve::PartOfSpeech::Pronoun, :personal],
  'PRP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
  'RB' => [Ve::PartOfSpeech::Adverb, nil],
  'RBR' => [Ve::PartOfSpeech::Adverb, :comparative],
  'RBS' => [Ve::PartOfSpeech::Adverb, :superlative],
  'RP' => [Ve::PartOfSpeech::Postposition, nil],
  'SYM' => [Ve::PartOfSpeech::Symbol, nil],
  'TO' => [Ve::PartOfSpeech::Preposition, nil],
  'UH' => [Ve::PartOfSpeech::Interjection, nil],
  'VB' => [Ve::PartOfSpeech::Verb, nil],
  'VBD' => [Ve::PartOfSpeech::Verb, :past],
  'VBG' => [Ve::PartOfSpeech::Verb, :present_participle],
  'VBN' => [Ve::PartOfSpeech::Verb, :past_participle],
  'VBP' => [Ve::PartOfSpeech::Verb, nil],
  'VBZ' => [Ve::PartOfSpeech::Verb, nil],
  'WDT' => [Ve::PartOfSpeech::Determiner, nil],
  'WP' => [Ve::PartOfSpeech::Pronoun, nil],
  'WP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
  'WRB' => [Ve::PartOfSpeech::Adverb, nil],
  'Z' => [Ve::PartOfSpeech::Determiner, nil]
}.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from Ve::Parse

#as_json

Constructor Details

#initialize(text, output) ⇒ FreelingEn

Returns a new instance of FreelingEn.



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# File 'lib/providers/freeling_en.rb', line 80

# Builds the token list for +text+ from the line-oriented Freeling +output+.
#
# Every non-empty line is split on whitespace into literal, lemma,
# part-of-speech tag and accuracy and stored as a :parsed token. Empty lines
# become :sentence_split tokens. Stretches of +text+ found in front of a
# token's literal, or left over when the last line is reached, are stored as
# :unparsed tokens. :parsed and :unparsed tokens carry a :characters range
# computed from a running position in +text+.
#
# The lines in +output+ are not modified.
#
# @param text [String] the text that was analysed
# @param output [Array<String>] the analyser output, one entry per line
def initialize(text, output)
  @tokens = []
  @text = text
  position = 0

  output.each_with_index do |raw_line, index|
    # Strip a copy instead of the caller's string: the input stays untouched
    # and frozen lines do not raise.
    line = raw_line.rstrip
    token = {:raw => line}

    # Anything unparsed at the end of the text
    # This must happen before sentence splits are detected to avoid funny ordering
    if output.length > 1 && output.length == index + 1
      unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
      # The match is nil when position has run past the end of the text, which
      # happens when a literal that was not found in the text still advanced it.
      if unparsed_md && unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed,
                          :literal => unparsed_md[1],
                          :raw => ''}
        unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
        @tokens << unparsed_token
      end
    end

    # Sentence splits are just empty lines in Freeling
    if line.empty?
      token[:type] = :sentence_split
      token[:literal] = ''
      @tokens << token
      next
    end

    # The parsed token
    info = line.split(/\s+/)
    token[:type] = :parsed
    [:literal, :lemma, :pos, :accuracy].each_with_index do |attr, i|
      token[attr] = info[i]
    end

    # Freeling replaces spaces with underscores; turn them back into spaces.
    token[:literal] = token[:literal].tr('_', ' ')
    # A line with a single field has no lemma; keep it nil instead of raising.
    token[:lemma] = token[:lemma].tr('_', ' ') if token[:lemma]

    # Anything unparsed preceding this token.
    # We need to do this complicated dance with _ since Freeling replaces spaces with it.
    # And so we need to be able to find the token with both spaces and _ in it since
    # we don't know what the original in the text actually is.
    # Once we have the location in the text we can figure out if it should be with spaces or _.
    # NOTE(review): the literal no longer contains '_' at this point, so the
    # substitution below does not appear to change the pattern - confirm the intent.
    unparsed_re = %r{(.*?) #{Regexp.quote(token[:literal])}}mx
    unparsed_re = %r{#{unparsed_re.to_s.gsub('_', '[\s_]')}}
    unparsed_md = unparsed_re.match(text, position)
    if unparsed_md && unparsed_md[1].length > 0
      unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
      unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
      @tokens << unparsed_token
      position += unparsed_token[:literal].length
    end

    token[:characters] = (position..(position+token[:literal].length-1))
    position += token[:literal].length
    @tokens << token
  end
end

Instance Attribute Details

#text ⇒ Object (readonly)

Returns the value of attribute text.



78
79
80
# File 'lib/providers/freeling_en.rb', line 78

# Returns the text this parse was built from, i.e. the +text+ argument that
# was handed to the constructor.
def text
  @text
end

#tokens ⇒ Object (readonly)

Returns the value of attribute tokens.



78
79
80
# File 'lib/providers/freeling_en.rb', line 78

# Returns the token hashes collected by the constructor, in the order in
# which they were appended. Each hash carries at least :type and :literal.
def tokens
  @tokens
end

Instance Method Details

#sentences ⇒ Object



206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/providers/freeling_en.rb', line 206

# Reassembles the sentences of the text from the token literals.
#
# Literals are concatenated until a :sentence_split token is reached; each
# split closes the current sentence (even an empty one). Text left over after
# the last split is returned as a final sentence. Surrounding whitespace is
# stripped from every sentence. Token literals are not modified.
#
# @return [Array<String>] the sentences, in order
def sentences
  result = []
  current = ''.dup

  @tokens.each do |token|
    if token[:type] == :sentence_split
      result << current.strip
      current = ''.dup
    else
      current << token[:literal]
    end
  end

  # In case there is no :sentence_split at the end
  result << current.strip unless current.empty?

  result
end

#words ⇒ Object



180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# File 'lib/providers/freeling_en.rb', line 180

# Groups the :parsed tokens into Ve::Word objects.
#
# A possessive ending (tag 'POS') is appended to the word in front of it
# instead of becoming a word of its own. Every other token is looked up in
# INTERNAL_INFO_FOR_PARSED_POS; a tag that is not in the table becomes
# Ve::PartOfSpeech::Symbol when it is an 'F' followed by word characters, and
# Ve::PartOfSpeech::TBD otherwise.
#
# @return [Array<Ve::Word>] the words, in token order
def words
  words = []

  @tokens.select { |t| t[:type] == :parsed }.each do |token|
    # Possessive ending, add to previous token. When there is no previous word
    # to attach to, fall through and treat it like any other token instead of
    # raising on a nil word.
    if token[:pos] == 'POS' && !words.empty?
      words.last.word << token[:literal]
      words.last.tokens << token
      next
    end

    # All other tokens
    pos, grammar = INTERNAL_INFO_FOR_PARSED_POS[token[:pos]]
    pos ||= Ve::PartOfSpeech::Symbol if token[:pos] =~ /^F\w+$/
    pos ||= Ve::PartOfSpeech::TBD

    # Pass a copy of the literal so that appending a possessive ending to the
    # word cannot alter the token's own literal.
    # NOTE(review): only matters if Ve::Word keeps the string it is given - confirm.
    words << Ve::Word.new(token[:literal].dup, token[:lemma], pos, [token], {:grammar => grammar})
  end

  words
end