Class: Corenlp::Token

Inherits:
Object
  • Object
show all
Defined in:
lib/corenlp/token.rb

Direct Known Subclasses

Enclitic, Number, Punctuation, Word

Constant Summary collapse

IGNORED_ENTITIES =
["PERSON"]
Enclitics =
%w{'ll 'm 're 's 't 've 'nt n't 'd ’ll ’m ’re ’s ’t ’ve ’nt n’t ’d}
WordRegexp =
/^[[:alpha:]\-'\/]+$/
NumberRegexp =
/^#?(\d+)(,\d+)*(\.\d+)?$/
PunctRegexp =
/^[[:punct:]'"\$]+$/
WebsiteRegexp =
/https?:\/\/[\S]+/
STANFORD_TEXT_REPLACEMENTS =

The character replacements that Stanford performs which we reverse:

{
  '' => "''",    '' => '``',    '(' => '-LRB-',
  ')' => '-RRB-', '[' => '-LSB-', ']' => '-RSB-',
  '{' => '-LCB-', '}' => '-RCB-',
  '' => '`', '' => '\'', '' => '--', '/' => '\\/'
}

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(attrs = {}) ⇒ Token

Returns a new instance of Token.



5
6
7
8
9
10
11
12
# File 'lib/corenlp/token.rb', line 5

# Builds a token from a hash of attributes.
#
# The values stored under :index, :text, :penn_treebank_tag, :stanford_lemma,
# :type and :ner are copied into the instance variables of the same name.
# A key that is missing from +attrs+ stores whatever attrs[key] returns
# (nil for a plain Hash).
#
# @param attrs [Hash] attribute values keyed by symbol
def initialize(attrs = {})
  %i[index text penn_treebank_tag stanford_lemma type ner].each do |field|
    instance_variable_set("@#{field}", attrs[field])
  end
end

Instance Attribute Details

#index ⇒ Object

Returns the value of attribute index.



3
4
5
# File 'lib/corenlp/token.rb', line 3

# Reader for the @index instance variable, which #initialize assigns from
# attrs[:index].
def index
  @index
end

#ner ⇒ Object

Returns the value of attribute ner.



3
4
5
# File 'lib/corenlp/token.rb', line 3

# Reader for the @ner instance variable, which #initialize assigns from
# attrs[:ner]. #ignored_entity? checks this value against IGNORED_ENTITIES.
def ner
  @ner
end

#penn_treebank_tag ⇒ Object

Returns the value of attribute penn_treebank_tag.



3
4
5
# File 'lib/corenlp/token.rb', line 3

# Reader for the @penn_treebank_tag instance variable, which #initialize
# assigns from attrs[:penn_treebank_tag].
def penn_treebank_tag
  @penn_treebank_tag
end

#stanford_lemma ⇒ Object

Returns the value of attribute stanford_lemma.



3
4
5
# File 'lib/corenlp/token.rb', line 3

# Reader for the @stanford_lemma instance variable, which #initialize assigns
# from attrs[:stanford_lemma].
def stanford_lemma
  @stanford_lemma
end

#text ⇒ Object

Returns the value of attribute text.



3
4
5
# File 'lib/corenlp/token.rb', line 3

# Reader for the @text instance variable, which #initialize assigns from
# attrs[:text]. #website_text? matches this value against a URL pattern.
def text
  @text
end

#type ⇒ Object

Returns the value of attribute type.



3
4
5
# File 'lib/corenlp/token.rb', line 3

# Reader for the @type instance variable, which #initialize assigns from
# attrs[:type].
def type
  @type
end

Class Method Details

.clean_stanford_text(text) ⇒ Object



33
34
35
36
37
38
# File 'lib/corenlp/token.rb', line 33

# Reverses the substitutions listed in Token::STANFORD_TEXT_REPLACEMENTS:
# every occurrence of a hash value in +text+ is replaced by its key, in the
# hash's iteration order.
#
# An unfrozen String is modified in place and the same object is returned.
# A frozen String cannot be changed by gsub!, so it is copied first and the
# cleaned copy is returned, leaving the argument untouched.
#
# @param text [String] token text as emitted by Stanford
# @return [String] the text with the replacements reversed
def self.clean_stanford_text(text)
  # gsub! raises on a frozen receiver; work on an unfrozen copy instead.
  text = text.dup if text.frozen?
  Token::STANFORD_TEXT_REPLACEMENTS.each_pair do |original, replacement|
    text.gsub!(replacement, original)
  end
  text
end

.token_subclass_from_text(text) ⇒ Object



58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/corenlp/token.rb', line 58

# Chooses the Token subclass that corresponds to a piece of token text.
#
# The checks run in this order and the first hit wins:
# 1. text listed in Enclitics                        -> Enclitic
# 2. text matching WordRegexp (but not a lone "-"),
#    or containing a match for WebsiteRegexp         -> Word
# 3. text matching PunctRegexp                       -> Punctuation
# 4. text matching NumberRegexp                      -> Number
# Anything else falls back to Token itself.
#
# @param text [String] the raw token text
# @return [Class] Enclitic, Word, Punctuation, Number or Token
def self.token_subclass_from_text(text)
  return Enclitic if Enclitics.include?(text)
  # A lone hyphen satisfies WordRegexp but is excluded here so that it can be
  # picked up by the punctuation check below.
  return Word if (text =~ WordRegexp && text != '-') || (text =~ WebsiteRegexp)
  return Punctuation if text =~ PunctRegexp
  return Number if text =~ NumberRegexp

  Token
end

Instance Method Details

#==(other) ⇒ Object



24
25
26
27
# File 'lib/corenlp/token.rb', line 24

def ==(other)
  index == other.index && \
    penn_treebank_tag == other.penn_treebank_tag && type == other.type
end

#content? ⇒ Boolean

Returns:

  • (Boolean)


16
17
18
# File 'lib/corenlp/token.rb', line 16

# Tells whether this token is a Word or an Enclitic (or an instance of a
# subclass of either).
#
# @return [Boolean]
def content?
  case self
  when Word, Enclitic then true
  else false
  end
end

#ignored_entity? ⇒ Boolean

Returns:

  • (Boolean)


54
55
56
# File 'lib/corenlp/token.rb', line 54

# Tells whether this token's NER label is one of IGNORED_ENTITIES.
#
# @return [Boolean]
def ignored_entity?
  entity_label = ner
  IGNORED_ENTITIES.include?(entity_label)
end

#top_level_penn_treebank_category ⇒ Object



20
21
22
# File 'lib/corenlp/token.rb', line 20

# Returns the first element of the Penn Treebank tag (for a String tag, its
# first character, e.g. "N" for "NNP").
#
# A token can be built without a tag (#initialize defaults its attributes to
# an empty hash), so a missing tag yields nil instead of raising
# NoMethodError.
#
# @return [Object, nil] penn_treebank_tag[0], or nil when there is no tag
def top_level_penn_treebank_category
  tag = penn_treebank_tag
  tag && tag[0]
end

#website_text? ⇒ Boolean

Returns:

  • (Boolean)


29
30
31
# File 'lib/corenlp/token.rb', line 29

# Tells whether the token text contains a URL scheme prefix.
#
# Both "http://" and "https://" are recognised, in line with WebsiteRegexp,
# which token_subclass_from_text uses to classify URLs as words.
#
# @return [Integer, nil] the offset of the first match (truthy), or nil when
#   the text contains no such prefix
def website_text?
  text =~ %r{https?://}
end