Class: Corenlp::Token

Inherits:

Object

Object
Corenlp::Token

show all

Defined in: lib/corenlp/token.rb

Direct Known Subclasses

Enclitic, Number, Punctuation, Word

Constant Summary collapse

IGNORED_ENTITIES =

["PERSON"]

Enclitics =

%w{'ll 'm 're 's 't 've 'nt n't 'd ’ll ’m ’re ’s ’t ’ve ’nt n’t ’d}

WordRegexp =

/^[[:alpha:]\-'\/]+$/

NumberRegexp =

/^#?(\d+)(,\d+)*(\.\d+)?$/

PunctRegexp =

/^[[:punct:]'"\$]+$/

WebsiteRegexp =

/https?:\/\/[\S]+/

STANFORD_TEXT_REPLACEMENTS =

The character replacements that Stanford performs, which we reverse:

{
  '”' => "''",    '“' => '``',    '(' => '-LRB-',
  ')' => '-RRB-', '[' => '-LSB-', ']' => '-RSB-',
  '{' => '-LCB-', '}' => '-RCB-',
  '‘' => '`', '’' => '\'', '—' => '--', '/' => '\\/'
}

Instance Attribute Summary collapse

#index ⇒ Object

Returns the value of attribute index.
#ner ⇒ Object

Returns the value of attribute ner.
#penn_treebank_tag ⇒ Object

Returns the value of attribute penn_treebank_tag.
#stanford_lemma ⇒ Object

Returns the value of attribute stanford_lemma.
#text ⇒ Object

Returns the value of attribute text.
#type ⇒ Object

Returns the value of attribute type.

Class Method Summary collapse

Instance Method Summary collapse

#==(other) ⇒ Object
#content? ⇒ Boolean
#ignored_entity? ⇒ Boolean
#initialize(attrs = {}) ⇒ Token constructor

A new instance of Token.
#top_level_penn_treebank_category ⇒ Object
#website_text? ⇒ Boolean

Constructor Details

#initialize(attrs = {}) ⇒ `Token`

Returns a new instance of Token.

# File 'lib/corenlp/token.rb', line 5

# Builds a token from a collection of attributes.
#
# For each of +:index+, +:text+, +:penn_treebank_tag+, +:stanford_lemma+,
# +:type+ and +:ner+, the value of +attrs[key]+ is stored in the instance
# variable of the same name. With a plain Hash, a missing key therefore
# leaves the corresponding variable set to +nil+.
#
# @param attrs [Hash] attribute values keyed by symbol
def initialize(attrs = {})
  %i[index text penn_treebank_tag stanford_lemma type ner].each do |field|
    instance_variable_set("@#{field}", attrs[field])
  end
end

Instance Attribute Details

#index ⇒ `Object`

Returns the value of attribute index.



3
4
5

# File 'lib/corenlp/token.rb', line 3

# Reader for the +index+ attribute.
#
# @return [Object] whatever is stored in +@index+ (+nil+ if never assigned)
def index
  instance_variable_get(:@index)
end

#ner ⇒ `Object`

Returns the value of attribute ner.



3
4
5

# File 'lib/corenlp/token.rb', line 3

# Reader for the +ner+ attribute.
#
# @return [Object] whatever is stored in +@ner+ (+nil+ if never assigned)
def ner
  instance_variable_get(:@ner)
end

#penn_treebank_tag ⇒ `Object`

Returns the value of attribute penn_treebank_tag.



3
4
5

# File 'lib/corenlp/token.rb', line 3

# Reader for the +penn_treebank_tag+ attribute.
#
# @return [Object] whatever is stored in +@penn_treebank_tag+ (+nil+ if never
#   assigned)
def penn_treebank_tag
  instance_variable_get(:@penn_treebank_tag)
end

#stanford_lemma ⇒ `Object`

Returns the value of attribute stanford_lemma.



3
4
5

# File 'lib/corenlp/token.rb', line 3

# Reader for the +stanford_lemma+ attribute.
#
# @return [Object] whatever is stored in +@stanford_lemma+ (+nil+ if never
#   assigned)
def stanford_lemma
  instance_variable_get(:@stanford_lemma)
end

#text ⇒ `Object`

Returns the value of attribute text.



3
4
5

# File 'lib/corenlp/token.rb', line 3

# Reader for the +text+ attribute.
#
# @return [Object] whatever is stored in +@text+ (+nil+ if never assigned)
def text
  instance_variable_get(:@text)
end

#type ⇒ `Object`

Returns the value of attribute type.



3
4
5

# File 'lib/corenlp/token.rb', line 3

# Reader for the +type+ attribute.
#
# @return [Object] whatever is stored in +@type+ (+nil+ if never assigned)
def type
  instance_variable_get(:@type)
end

Class Method Details

.clean_stanford_text(text) ⇒ `Object`

# File 'lib/corenlp/token.rb', line 33

# Reverses the character substitutions listed in
# Token::STANFORD_TEXT_REPLACEMENTS.
#
# For every pair in that hash, each occurrence of the value (the substituted
# form) is replaced by the key (the original character), in the hash's
# iteration order. Replacement uses +gsub!+, so the string passed in is
# modified in place and that same object is returned.
#
# @param text [String] text to clean; mutated by this call
# @return [String] the same string object after all replacements
def self.clean_stanford_text(text)
  Token::STANFORD_TEXT_REPLACEMENTS.each_with_object(text) do |(plain, substituted), buffer|
    buffer.gsub!(substituted, plain)
  end
end

.token_subclass_from_text(text) ⇒ `Object`

# File 'lib/corenlp/token.rb', line 58

# Picks the token class that matches a piece of text.
#
# Checks are applied in order and the first hit wins:
# 1. text listed in +Enclitics+            -> +Enclitic+
# 2. text matching +WordRegexp+ (other than a lone "-"), or matching
#    +WebsiteRegexp+                       -> +Word+
# 3. text matching +PunctRegexp+           -> +Punctuation+
# 4. text matching +NumberRegexp+          -> +Number+
# 5. anything else                         -> +Token+
#
# NOTE(review): only the single character "-" is excluded from the word
# check; other strings that match WordRegexp without containing a letter are
# still returned as Word. Confirm whether that is intended.
#
# @param text [String] the raw token text
# @return [Class] one of Enclitic, Word, Punctuation, Number or Token
def self.token_subclass_from_text(text)
  return Enclitic if Enclitics.include?(text)

  wordlike = text =~ WordRegexp && text != '-'
  return Word if wordlike || text =~ WebsiteRegexp
  return Punctuation if text =~ PunctRegexp
  return Number if text =~ NumberRegexp

  Token
end

Instance Method Details

#==(other) ⇒ `Object`

# File 'lib/corenlp/token.rb', line 24

# Compares this token with another object.
#
# Two tokens are considered equal when their +index+, +penn_treebank_tag+ and
# +type+ all match; +text+, +stanford_lemma+ and +ner+ are not compared.
#
# An object that does not expose all three readers (for example +nil+) is
# never equal, rather than raising NoMethodError.
#
# @param other [Object] the object to compare against
# @return [Boolean] true when the three compared attributes are equal
def ==(other)
  # Equality checks should answer false for foreign objects, not blow up.
  comparable = %i[index penn_treebank_tag type].all? { |reader| other.respond_to?(reader) }
  return false unless comparable

  index == other.index &&
    penn_treebank_tag == other.penn_treebank_tag &&
    type == other.type
end

#content? ⇒ `Boolean`

Returns:

(Boolean)



16
17
18

# File 'lib/corenlp/token.rb', line 16

# Tells whether this token is an instance of +Word+ or +Enclitic+ (including
# subclasses of either).
#
# @return [Boolean] true for Word and Enclitic instances, false otherwise
def content?
  case self
  when Word, Enclitic then true
  else false
  end
end

#ignored_entity? ⇒ `Boolean`

Returns:

(Boolean)



54
55
56

# File 'lib/corenlp/token.rb', line 54

# Tells whether this token's +ner+ value appears in +IGNORED_ENTITIES+.
#
# @return [Boolean] true when +ner+ equals one of the ignored entity labels
def ignored_entity?
  entity_label = ner
  IGNORED_ENTITIES.any? { |ignored| ignored == entity_label }
end

#top_level_penn_treebank_category ⇒ `Object`



20
21
22

# File 'lib/corenlp/token.rb', line 20

# Returns the first character of the token's Penn Treebank tag.
#
# The constructor stores +nil+ when no tag is supplied, so a missing tag is
# handled here by returning +nil+ instead of raising NoMethodError. An empty
# tag also yields +nil+.
#
# @return [String, nil] the leading character of +penn_treebank_tag+, or nil
#   when there is no tag
def top_level_penn_treebank_category
  tag = penn_treebank_tag
  tag && tag[0]
end

#website_text? ⇒ `Boolean`

Returns:

(Boolean)



29
30
31

# File 'lib/corenlp/token.rb', line 29

# Tells whether the token text contains a web address scheme.
#
# Both "http://" and "https://" are recognised, matching the schemes that
# WebsiteRegexp accepts when tokens are classified; previously only
# "http://" was detected here.
#
# The result is that of +=~+: the character offset of the match (truthy) or
# +nil+ when there is no match.
#
# @return [Integer, nil] offset of the first scheme match, or nil
def website_text?
  text =~ %r{https?://}
end

Class: Corenlp::Token

Direct Known Subclasses

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(attrs = {}) ⇒ Token

Instance Attribute Details

#index ⇒ Object

#ner ⇒ Object

#penn_treebank_tag ⇒ Object

#stanford_lemma ⇒ Object

#text ⇒ Object

#type ⇒ Object

Class Method Details

.clean_stanford_text(text) ⇒ Object

.token_subclass_from_text(text) ⇒ Object

Instance Method Details

#==(other) ⇒ Object

#content? ⇒ Boolean

#ignored_entity? ⇒ Boolean

#top_level_penn_treebank_category ⇒ Object

#website_text? ⇒ Boolean

#initialize(attrs = {}) ⇒ `Token`

#index ⇒ `Object`

#ner ⇒ `Object`

#penn_treebank_tag ⇒ `Object`

#stanford_lemma ⇒ `Object`

#text ⇒ `Object`

#type ⇒ `Object`

.clean_stanford_text(text) ⇒ `Object`

.token_subclass_from_text(text) ⇒ `Object`

#==(other) ⇒ `Object`

#content? ⇒ `Boolean`

#ignored_entity? ⇒ `Boolean`

#top_level_penn_treebank_category ⇒ `Object`

#website_text? ⇒ `Boolean`