Class: Punkt::Trainer

Inherits:
Base
  • Object
show all
Defined in:
lib/punkt-segmenter/punkt/trainer.rb

Constant Summary collapse

ABBREV =

cut-off value for deciding whether a ‘token’ is an abbreviation

0.3
IGNORE_ABBREV_PENALTY =

allows the disabling of the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period.

false
ABBREV_BACKOFF =

upper cut-off for Mikheev’s (2002) abbreviation detection algorithm

5
COLLOCATION =

minimal log-likelihood value that two tokens need to be considered as a collocation

7.88
SENT_STARTER =

minimal log-likelihood value that a token requires to be considered as a frequent sentence starter

30
INCLUDE_ALL_COLLOCS =

this includes as potential collocations all word pairs where the first word ends in a period. It may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify.

true
INCLUDE_ABBREV_COLLOCS =

this includes as potential collocations all word pairs where the first word is an abbreviation. Such collocations override the orthographic heuristic, but not the sentence starter heuristic. This is overridden by INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials and ordinals are considered.

false
MIN_COLLOC_FREQ =

this sets a minimum bound on the number of times a bigram needs to appear before it can be considered a collocation, in addition to log likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is true.

1

Instance Method Summary collapse

Methods inherited from Base

#tokenize_words

Constructor Details

#initialize(language_vars = Punkt::LanguageVars.new, token_class = Punkt::Token) ⇒ Trainer

Returns a new instance of Trainer.



42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/punkt-segmenter/punkt/trainer.rb', line 42

# Builds a trainer with empty statistics.
#
# Both arguments are handed unchanged to the superclass initializer; this
# method then resets the trainer's own bookkeeping.
#
# @param language_vars [Punkt::LanguageVars] passed through to `super`
# @param token_class [Class] passed through to `super`
def initialize(language_vars = Punkt::LanguageVars.new, token_class = Punkt::Token)
  super(language_vars, token_class)

  # Running counters, both starting at zero.
  @period_tokens_count  = 0
  @sentence_break_count = 0

  # One independent frequency distribution per statistic
  # (presumably filled while training -- the code doing so is not shown here).
  @type_fdist = Probability::FrequencyDistribution.new
  @collocation_fdist = Probability::FrequencyDistribution.new
  @sentence_starter_fdist = Probability::FrequencyDistribution.new

  # Cleared here; set to true by #finalize_training.
  @finalized = false
end

Instance Method Details

#finalize_training ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/punkt-segmenter/punkt/trainer.rb', line 69

# Rebuilds the sentence-starter and collocation lists held by @parameters
# and marks the trainer as finalized.
#
# Each list is cleared first and then refilled from the corresponding
# finder. The second value yielded by each finder (presumably a
# log-likelihood score -- confirm against the finders) is ignored.
#
# @return [true] the value assigned to @finalized
def finalize_training
  @parameters.clear_sentence_starters
  find_sentence_starters { |starter, _score| @parameters.sentence_starters << starter }

  @parameters.clear_collocations
  find_collocations do |pair, _score|
    # Only the first two entries of the yielded collection are stored.
    first_type  = pair[0]
    second_type = pair[1]
    @parameters.collocations << [first_type, second_type]
  end

  @finalized = true
end

#parameters ⇒ Object



64
65
66
67
# File 'lib/punkt-segmenter/punkt/trainer.rb', line 64

# Returns the trained parameters, running #finalize_training first when
# @finalized is not yet truthy.
#
# @return [Object] the current value of @parameters
def parameters
  unless @finalized
    finalize_training
  end

  @parameters
end

#train(text_or_tokens) ⇒ Object



55
56
57
58
59
60
61
62
# File 'lib/punkt-segmenter/punkt/trainer.rb', line 55

# Trains on either raw text or a list of raw tokens.
#
# A String is split with #tokenize_words; each element of an Array is
# wrapped with @token_class.new. Any other input leaves the token list as
# nil, which is still passed on to #train_tokens.
#
# @param text_or_tokens [String, Array] training material
# @return [Object] whatever #train_tokens returns
def train(text_or_tokens)
  tokens =
    if text_or_tokens.kind_of?(String)
      tokenize_words(text_or_tokens)
    elsif text_or_tokens.kind_of?(Array)
      text_or_tokens.map { |raw| @token_class.new(raw) }
    end

  train_tokens(tokens)
end