Class: Ve::Parse::FreelingEn

Inherits:

Ve::Parse

Object
Ve::Parse
Ve::Parse::FreelingEn

Defined in: lib/providers/freeling_en.rb

Constant Summary collapse

# Lookup table from the POS tag string of a parsed token to a two-element
# array: [part-of-speech class, grammar symbol or nil]. #words destructures
# each entry into `pos, grammar`.
# Frozen so the shared table cannot be modified by accident at runtime.
INTERNAL_INFO_FOR_PARSED_POS = {
  'CC' => [Ve::PartOfSpeech::Conjunction, nil],
  'CD' => [Ve::PartOfSpeech::Number, nil],
  'DT' => [Ve::PartOfSpeech::Determiner, nil],
  'EX' => [Ve::PartOfSpeech::Pronoun, nil],
  'FW' => [Ve::PartOfSpeech::Unknown, nil],
  'IN' => [Ve::PartOfSpeech::Preposition, nil],
  'JJ' => [Ve::PartOfSpeech::Adjective, nil],
  'JJR' => [Ve::PartOfSpeech::Adjective, :comparative],
  'JJS' => [Ve::PartOfSpeech::Adjective, :superlative],
  'LS' => [Ve::PartOfSpeech::Unknown, nil],
  'MD' => [Ve::PartOfSpeech::Verb, :modal],
  'NN' => [Ve::PartOfSpeech::Noun, nil],
  'NNS' => [Ve::PartOfSpeech::Noun, :plural],
  'NNP' => [Ve::PartOfSpeech::ProperNoun, nil],
  'NNPS' => [Ve::PartOfSpeech::ProperNoun, :plural],
  'PDT' => [Ve::PartOfSpeech::Determiner, nil],
  'PRP' => [Ve::PartOfSpeech::Pronoun, :personal],
  'PRP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
  'RB' => [Ve::PartOfSpeech::Adverb, nil],
  'RBR' => [Ve::PartOfSpeech::Adverb, :comparative],
  'RBS' => [Ve::PartOfSpeech::Adverb, :superlative],
  'RP' => [Ve::PartOfSpeech::Postposition, nil],
  'SYM' => [Ve::PartOfSpeech::Symbol, nil],
  'TO' => [Ve::PartOfSpeech::Preposition, nil],
  'UH' => [Ve::PartOfSpeech::Interjection, nil],
  'VB' => [Ve::PartOfSpeech::Verb, nil],
  'VBD' => [Ve::PartOfSpeech::Verb, :past],
  'VBG' => [Ve::PartOfSpeech::Verb, :present_participle],
  'VBN' => [Ve::PartOfSpeech::Verb, :past_participle],
  'VBP' => [Ve::PartOfSpeech::Verb, nil],
  'VBZ' => [Ve::PartOfSpeech::Verb, nil],
  'WDT' => [Ve::PartOfSpeech::Determiner, nil],
  'WP' => [Ve::PartOfSpeech::Pronoun, nil],
  'WP$' => [Ve::PartOfSpeech::Pronoun, :possessive],
  'WRB' => [Ve::PartOfSpeech::Adverb, nil],
  'Z' => [Ve::PartOfSpeech::Determiner, nil]
}.freeze

Instance Attribute Summary collapse

#text ⇒ Object readonly

Returns the value of attribute text.
#tokens ⇒ Object readonly

Returns the value of attribute tokens.

Instance Method Summary collapse

#initialize(text, output) ⇒ FreelingEn constructor

A new instance of FreelingEn.
#sentences ⇒ Object
#words ⇒ Object

Methods inherited from Ve::Parse

#as_json

Constructor Details

#initialize(text, output) ⇒ `FreelingEn`

Returns a new instance of FreelingEn.

# File 'lib/providers/freeling_en.rb', line 78

# Builds @tokens by walking the analyser output line by line and aligning
# each line with its location in +text+.
#
# Every token is a Hash with a :type of :parsed, :unparsed or :sentence_split.
# Parsed and unparsed tokens also carry :literal and :characters (a Range of
# character offsets into +text+).
#
# @param text [String] the original text that was analysed
# @param output [Array<String>] analyser output, one line per element. A blank
#   line is a sentence split; any other line is split on whitespace into
#   literal, lemma, POS tag and accuracy. The lines are not modified.
def initialize(text, output)
  @tokens = []
  @text = text
  position = 0

  output.each_with_index do |raw_line, index|
    # Strip a copy: the caller's strings stay untouched and may be frozen.
    line = raw_line.rstrip
    token = {:raw => line}

    # Anything unparsed at the end of the text
    # This must happen before sentence splits are detected to avoid funny ordering
    if output.length > 1 && output.length == index + 1
      unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
      # The match is nil when position has run past the end of the text.
      if unparsed_md && unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed,
                          :literal => unparsed_md[1],
                          :raw => ''}
        unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
        @tokens << unparsed_token
      end
    end

    # Sentence splits are just empty lines in Freeling
    if line.length == 0
      token[:type] = :sentence_split
      token[:literal] = ''
      @tokens << token
      next
    end

    # The parsed token
    info = line.split(/\s+/)
    token[:type] = :parsed
    [:literal, :lemma, :pos, :accuracy].each_with_index do |attr, i|
      token[attr] = info[i]
    end

    token[:literal] = token[:literal].tr('_', ' ')
    # A line with a single field has no lemma; leave it nil instead of raising.
    token[:lemma] = token[:lemma].tr('_', ' ') if token[:lemma]

    # Anything unparsed preceding this token.
    # Freeling replaces spaces with _, so we cannot know whether the original
    # text had whitespace or an underscore at that spot. The literal now holds
    # spaces only, so each of them is matched as "whitespace or underscore".
    literal_re = token[:literal].split(/ /, -1).map { |part| Regexp.quote(part) }.join('[\s_]')
    unparsed_re = %r{(.*?) #{literal_re}}mx
    unparsed_md = unparsed_re.match(text, position)
    if unparsed_md && unparsed_md[1].length > 0
      unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
      unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
      @tokens << unparsed_token
      position += unparsed_token[:literal].length
    end

    token[:characters] = (position..(position+token[:literal].length-1))
    position += token[:literal].length
    @tokens << token
  end
end

Instance Attribute Details

#text ⇒ `Object` (readonly)

Returns the value of attribute text.



76
77
78

# File 'lib/providers/freeling_en.rb', line 76

# Returns the text that was handed to the constructor, as stored in @text.
def text
  @text
end

#tokens ⇒ `Object` (readonly)

Returns the value of attribute tokens.



76
77
78

# File 'lib/providers/freeling_en.rb', line 76

# Returns the array of token hashes built by the constructor, in the order
# they were appended. Each token has a :type of :parsed, :unparsed or
# :sentence_split.
def tokens
  @tokens
end

Instance Method Details

#sentences ⇒ `Object`

# File 'lib/providers/freeling_en.rb', line 204

# Reassembles the sentences of the text from the token literals.
#
# Literals are concatenated in token order. Every :sentence_split token
# closes the sentence collected so far (even an empty one). Text left over
# after the last split becomes a final sentence. Each sentence is returned
# with surrounding whitespace stripped.
#
# @return [Array<String>]
def sentences
  finished = []
  buffer = ''.dup

  @tokens.each do |token|
    unless token[:type] == :sentence_split
      buffer << token[:literal]
      next
    end

    finished << buffer
    buffer = ''.dup
  end

  # Covers input that does not end with a :sentence_split
  finished << buffer unless buffer.empty?

  finished.map(&:strip)
end

#words ⇒ `Object`

# File 'lib/providers/freeling_en.rb', line 178

# Converts the :parsed tokens into Ve::Word objects.
#
# A token tagged 'POS' (possessive ending) is appended to the word before it:
# its literal is added to that word's text and the token to that word's token
# list. If no word precedes it, it becomes a word of its own instead of
# raising on a missing predecessor.
#
# The part of speech comes from INTERNAL_INFO_FOR_PARSED_POS. Tags missing
# from the table that match /^F\w+$/ map to Ve::PartOfSpeech::Symbol, and
# everything else to Ve::PartOfSpeech::TBD.
#
# NOTE(review): `word <<` appends in place. If Ve::Word keeps the very String
# it was constructed with, the preceding token's :literal grows as well —
# confirm against Ve::Word.
#
# @return [Array<Ve::Word>]
def words
  words = []

  @tokens.select { |t| t[:type] == :parsed }.each do |token|
    # Possessive ending, add to previous token (when there is one)
    if token[:pos] == 'POS' && !words.empty?
      words.last.word << token[:literal]
      words.last.tokens << token
      next
    end

    # All other tokens
    pos, grammar = INTERNAL_INFO_FOR_PARSED_POS[token[:pos]]
    pos ||= Ve::PartOfSpeech::Symbol if token[:pos] =~ /^F\w+$/
    pos ||= Ve::PartOfSpeech::TBD

    words << Ve::Word.new(token[:literal], token[:lemma], pos, [token], {:grammar => grammar})
  end

  words
end

Class: Ve::Parse::FreelingEn

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from Ve::Parse

Constructor Details

#initialize(text, output) ⇒ FreelingEn

Instance Attribute Details

#text ⇒ Object (readonly)

#tokens ⇒ Object (readonly)

Instance Method Details

#sentences ⇒ Object

#words ⇒ Object

#initialize(text, output) ⇒ `FreelingEn`

#text ⇒ `Object` (readonly)

#tokens ⇒ `Object` (readonly)

#sentences ⇒ `Object`

#words ⇒ `Object`