Class: PROIEL::Converter::CoNLLU::Sentence

Inherits:
Object
  • Object
show all
Defined in:
lib/proiel/cli/converters/conll-u.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(sentence) ⇒ Sentence

Initializes a PROIEL::Converter::CoNLLU::Sentence from a PROIEL::PROIELXML::Sentence.


48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/proiel/cli/converters/conll-u.rb', line 48

# Builds the converter's view of a sentence from a PROIEL sentence.
#
# Tokens whose +empty_token_sort+ is 'P' are skipped. A token whose form
# contains whitespace is split into one token per whitespace-separated part:
# the first part keeps the original id, head, relation and annotation, and
# every further part is attached to the first part with the relation "flat".
# All resulting tokens are then renumbered 1..n in order of appearance and
# wrapped in converter Token objects, which are stored in @tokens.
#
# @param sentence [#tokens] the source sentence; its tokens must respond to
#   the accessors read below (id, head_id, form, lemma, slashes, ...)
def initialize(sentence)
  whitespace = /[[:space:]]/

  # Unknown ids (including nil) resolve to 0, i.e. the root.
  id_to_number = Hash.new(0)

  # Ids for split-off parts are allocated above every id already used in the
  # sentence, so a synthetic id can never shadow a real token when ids are
  # translated to running numbers below.
  next_free_id = (sentence.tokens.map(&:id).compact.max || 0) + 1

  # Holds the sentence tokens after splitting.
  tks = []

  sentence.tokens.reject { |t| t.empty_token_sort == 'P' }.each do |tk|
    unless tk.form =~ whitespace
      tks << tk
      next
    end

    subtoks = tk.form.split(whitespace)
    # Hope the lemma splits the same way as the form. A missing (nil) lemma
    # yields no parts, in which case the form is used instead.
    sublemmas = tk.lemma.to_s.split(whitespace)

    subtoks.each_with_index do |subtok, i|
      first = i.zero?
      final = (i == subtoks.size - 1)

      if first
        id = tk.id
      else
        id = next_free_id
        next_free_id += 1
      end

      # The token constructor needs real slash objects, although the
      # information is thrown away again afterwards.
      slashes =
        if first
          tk.slashes.map do |rel, target|
            PROIEL::PROIELXML::Reader::Slash.new({ target_id: target, relation: rel })
          end
        else
          []
        end

      tks << PROIEL::Token.new(sentence,
                               id,
                               (first ? tk.head_id : tk.id), # head_id
                               subtok,
                               (sublemmas[i] || subtok),
                               tk.part_of_speech, # copy the postag
                               tk.morphology,
                               (first ? tk.relation : 'flat'),
                               nil, # empty_token_sort
                               tk.citation_part,
                               (first ? tk.presentation_before : nil),
                               (final ? tk.presentation_after : nil),
                               (first ? tk.antecedent_id : nil),
                               (first ? tk.information_status : nil),
                               (first ? tk.contrast_group : nil),
                               (first ? tk.foreign_ids : nil),
                               slashes,
                               (first ? tk.alignment_id : nil))
    end
  end

  # Map every token id to its 1-based position in the sentence.
  tks.each_with_index { |t, i| id_to_number[t.id] = i + 1 }

  @tokens = tks.map do |t|
    Token.new(id_to_number[t.id],
              id_to_number[t.head_id],
              # insert dots in any whitespace inside words and lemmata
              t.form.to_s.gsub(whitespace, '.'),
              t.lemma.to_s.gsub(whitespace, '.'),
              t.part_of_speech,
              t.language,
              t.morphology,
              t.relation,
              t.empty_token_sort,
              t.slashes.map { |relation, target_id| [id_to_number[target_id], relation] },
              t.citation_part,
              self)
  end
end

Instance Attribute Details

#tokens ⇒ Object

Returns the value of attribute tokens.


45
46
47
# File 'lib/proiel/cli/converters/conll-u.rb', line 45

# Reader for the @tokens instance variable, which #initialize fills with the
# converter tokens of this sentence.
#
# @return [Array] the current value of @tokens
def tokens
  @tokens
end

Instance Method Details

#convert ⇒ Object


114
115
116
117
118
119
# File 'lib/proiel/cli/converters/conll-u.rb', line 114

# Runs the conversion steps in order: restructure the graph, relabel it,
# then map the parts of speech.
#
# @return [self] the sentence itself, so calls can be chained
def convert
  tap do
    restructure_graph!
    relabel_graph!
    map_part_of_speech!
  end
end

#count_tokens ⇒ Object


133
134
135
# File 'lib/proiel/cli/converters/conll-u.rb', line 133

# Adds up the subgraph sizes reported by every root token.
#
# @return [Integer] the sum of +count_subgraph+ over all roots (0 without roots)
def count_tokens
  roots.inject(0) { |total, root| total + root.count_subgraph }
end

#demote_parentheticals_and_vocatives! ⇒ Object


168
169
170
171
172
173
174
175
176
177
# File 'lib/proiel/cli/converters/conll-u.rb', line 168

# Attaches root-level vocatives ('voc') and parenthetical predications
# ('parpred') to the single remaining root.
#
# If every root is a vocative/parenthetical, the first one is relabelled
# 'pred' so that it can serve as the head for the others.
#
# @raise [RuntimeError] if there are vocatives/parentheticals to demote but
#   not exactly one other root to attach them to
# @return [Array] the roots that were demoted
def demote_parentheticals_and_vocatives!
  detached_relations = ['voc', 'parpred']
  split_roots = lambda do
    roots.partition { |node| detached_relations.include?(node.relation) }
  end

  detached, regular = split_roots.call

  if regular.empty? && !detached.empty?
    # promote the first vocative/parenthetical to head in case there's nothing else
    detached.first.relation = 'pred'
    detached, regular = split_roots.call
  end

  unless detached.empty? || regular.size == 1
    raise "No unique root in this tree:\n#{to_graph}"
  end

  detached.each { |node| node.head_id = regular.first.id }
end

#demote_subjunctions! ⇒ Object


164
165
166
# File 'lib/proiel/cli/converters/conll-u.rb', line 164

# Calls +process_subjunction!+ on every token whose part of speech is 'G-'.
#
# The matching tokens are collected before any of them is processed.
#
# @return [Array] the tokens that were processed
def demote_subjunctions!
  subjunctions = @tokens.select { |token| token.part_of_speech == 'G-' }
  subjunctions.each { |subjunction| subjunction.process_subjunction! }
end

#find_token(identifier) ⇒ Object


121
122
123
# File 'lib/proiel/cli/converters/conll-u.rb', line 121

# Looks up a token by its id.
#
# Stops at the first match instead of collecting every match first.
#
# @param identifier [Object] the id to look for (compared with ==)
# @return [Object, nil] the first token in @tokens with that id, or nil
def find_token(identifier)
  @tokens.find { |t| t.id == identifier }
end

#map_part_of_speech! ⇒ Object


183
184
185
# File 'lib/proiel/cli/converters/conll-u.rb', line 183

# Asks every root token to map its part of speech by calling
# +map_part_of_speech!+ on it.
#
# @return [Array] the root tokens
def map_part_of_speech!
  roots.each { |root| root.map_part_of_speech! }
end

#prune_empty_rootnodes! ⇒ Object

TODO: this will leave several root nodes in many cases. For now, raise an error


150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/proiel/cli/converters/conll-u.rb', line 150

# Removes empty root tokens (+empty_token_sort+ 'V') from the sentence.
#
# For each empty root, its first dependent becomes a root and takes over the
# empty root's relation; the remaining dependents are re-attached to it. An
# empty root without dependents is simply removed. The check is repeated
# until no empty root is left, because a promoted dependent may itself be an
# empty token.
#
# TODO: this will leave several root nodes in many cases.
#
# @return [nil]
def prune_empty_rootnodes!
  until (empty_roots = roots.select { |r| r.empty_token_sort == 'V' }).empty?
    empty_roots.each do |r|
      # promote the first dependent to root, if there is one
      new_root = r.dependents.first
      if new_root
        new_root.head_id = 0
        new_root.relation = r.relation
        r.dependents.each { |d| d.head_id = new_root.id }
      end
      remove_token! r
    end
  end
end

#relabel_graph! ⇒ Object


179
180
181
# File 'lib/proiel/cli/converters/conll-u.rb', line 179

# Asks every root token to relabel its graph by calling +relabel_graph!+ on it.
#
# @return [Array] the root tokens
def relabel_graph!
  roots.each { |root| root.relabel_graph! }
end

#remove_token!(token) ⇒ Object


125
126
127
# File 'lib/proiel/cli/converters/conll-u.rb', line 125

# Deletes +token+ from @tokens.
#
# @param token [Object] the token to remove
# @return [Object, nil] whatever @tokens.delete returns for +token+
def remove_token!(token)
  @tokens.delete(token)
end

#restructure_graph! ⇒ Object


187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# File 'lib/proiel/cli/converters/conll-u.rb', line 187

# Rewrites the dependency graph step by step. The order of the steps below
# is significant and must not be changed.
#
# @return [Object] the result of #demote_parentheticals_and_vocatives!
def restructure_graph!
  # Drop empty tokens of sort 'P' before anything else looks at the graph.
  @tokens.reject! { |node| node.empty_token_sort == 'P' }

  prepositions = @tokens.select { |token| token.preposition? }
  prepositions.each { |preposition| preposition.process_preposition! }

  roots.each { |root| root.change_coordinations! }

  copulas = @tokens.select { |token| token.copula? }
  copulas.each { |copula| copula.process_copula! }

  prune_empty_rootnodes!

  # do ellipses from left to right for proper remnant treatment
  ellipses = @tokens.select { |token| token.ellipsis? }
  ellipses.sort_by { |ellipsis| ellipsis.left_corner.id }
          .each { |ellipsis| ellipsis.process_ellipsis! }

  demote_subjunctions!

  # DIRTY: remove the rest of the empty nodes by attaching them
  # to their grandmother with remnant. This is the best way to
  # do it given the current state of the UDEP scheme, but
  # revisions will come.
  roots.each { |root| root.remove_empties! }

  demote_parentheticals_and_vocatives!
end

#roots ⇒ Object


137
138
139
# File 'lib/proiel/cli/converters/conll-u.rb', line 137

# Collects the tokens attached directly to the root (head_id 0).
#
# @return [Array] those tokens, ordered by id
def roots
  top_level = @tokens.select { |token| token.head_id == 0 }
  top_level.sort_by { |token| token.id }
end

#to_conll ⇒ Object


145
146
147
# File 'lib/proiel/cli/converters/conll-u.rb', line 145

# Joins the +to_conll+ output of every token, one token per line.
#
# @return [String] the newline-separated lines (empty string without tokens)
def to_conll
  lines = @tokens.map { |token| token.to_conll }
  lines.join("\n")
end

#to_graph ⇒ Object


141
142
143
# File 'lib/proiel/cli/converters/conll-u.rb', line 141

# Joins the +to_graph+ output of every root token, one root per line.
#
# @return [String] the newline-separated graphs (empty string without roots)
def to_graph
  graphs = roots.map { |root| root.to_graph }
  graphs.join("\n")
end

#to_s ⇒ Object


129
130
131
# File 'lib/proiel/cli/converters/conll-u.rb', line 129

# Joins the +to_s+ output of every token, one token per line.
#
# @return [String] the newline-separated lines (empty string without tokens)
def to_s
  lines = @tokens.map { |token| token.to_s }
  lines.join("\n")
end