Class: PROIEL::Converter::CoNLLU
- Inherits: Object
  - Object
    - PROIEL::Converter::CoNLLU
- Defined in:
- lib/proiel/cli/converters/conll-u.rb,
lib/proiel/cli/converters/conll-u/syntax.rb,
lib/proiel/cli/converters/conll-u/morphology.rb
Defined Under Namespace
Constant Summary
# Ordered list of dependency relation labels, running from "nsubj" to "advcl".
# NOTE(review): judging by the name, the order presumably ranks relations by
# obliqueness; the code that consumes it is not visible here.
OBLIQUENESS_HIERARCHY = %w[
  nsubj
  obj
  iobj
  obl
  advmod
  csubj
  xcomp
  ccomp
  advcl
]
# Maps a PROIEL relation label to its target dependency label.
#
# A value is either a plain String (unconditional mapping) or a list of
# [label, predicate] pairs. Each predicate is a one-argument lambda that
# receives a node; every list ends in a catch-all or in a predicate that raises.
# NOTE(review): the consumer is not visible here; presumably the first pair
# whose predicate returns a truthy value wins, which is why order matters.
# NOTE(review): "pid" holds a single [label, predicate] pair rather than a list
# of pairs — confirm the consumer handles that shape.
RELATION_MAPPING = {
  "adnom" => "dep",
  "adv" => [
    ["advcl", lambda(&:clausal?)],
    ["advmod", lambda { |x| x.adverb? || x.preposition? }],
    ["advmod", lambda(&:adjectival?)], # adjective for adverb
    ["obl", lambda(&:nominal?)],
    ["advmod", lambda { |_x| true }],
  ],
  "ag" => "obl:agent", # original note: add :agent once defined
  "apos" => [
    ["flat:name", lambda { |x| x.proper_noun? && x.head && x.head.proper_noun? }],
    ["appos", lambda { |x| (x.nominal? || x.adjectival?) && x.head && x.head.nominal? }],
    ["acl", lambda { |x| x.clausal? && x.head && x.head.nominal? }], # add :relcl ?
    # what to do about sentential appositions?
    ["advcl", lambda(&:clausal?)],
    ["appos", lambda { |_x| true }],
  ],
  "arg" => "dep",
  "atr" => [
    ["nummod", lambda(&:cardinal?)],
    ["det", lambda { |x| x.pronominal? && !(!x.genitive? && x.head && x.head.genitive?) }], # TODO check
    ["nmod", lambda(&:nominal?)],
    ["acl", lambda { |x| x.clausal? }], # add :relcl?
    ["advmod", lambda { |x| x.head && x.head.clausal? }],
    ["amod", lambda { |_x| true }], # default
  ],
  "aux" => [
    ["det", lambda(&:determiner?)],
    # Guard against a missing head, as the other head-inspecting rules do, so
    # that a headless clausal node falls through to plain "aux" instead of
    # raising NoMethodError.
    ["aux:pass", lambda { |x| x.clausal? && x.head && x.head.passive? }],
    ["aux", lambda(&:clausal?)], # v2: probably want the modal particle "an" to go here too
    ["advmod", lambda(&:negation?)],
    ["discourse", lambda { |x| x.particle? || x.interjection? }],
    # include subjunctions that are aux here; (root sentences with subjunction)
    ["advmod", lambda { |x| x.adjectival? || x.adverb? || x.subjunction? }],
    ["cc", lambda(&:conjunction?)],
    ["flat:foreign", lambda(&:foreign?)],
    # We need some more distinctions to get Gothic and Armenian.
    # Introduce language in the treebank? (Read from xml)
    ["mark", lambda { |x| ['R-'].include?(x.part_of_speech) }], # 'R-' as infinitive marker in Gothic
    ["aux", lambda { |x| ['Pk'].include?(x.part_of_speech) }], # reflexive as valency reducer
    ['amod', lambda { |x| x.preposition? }], # Armenian DOM
    ['fixed', lambda { |x| ['Px', 'Pr'].include?(x.part_of_speech) }], # NB there are a lot of bogus annotations with 'Px'
    # MISANNOTATION IF A NOUN or a 'Pi' or a 'Pp' or a 'Ps'
  ],
  "comp" => [
    ['csubj:pass', lambda { |x| x.head && x.head.passive? }],
    ['csubj', lambda { |x| x.head && x.head.copula? }],
    ['ccomp', lambda { |_x| true }],
  ],
  "expl" => "expl",
  "narg" => [
    ['acl', lambda(&:clausal?)],
    ['nmod', lambda(&:nominal?)],
    ['nmod', lambda(&:adjectival?)], # nominalized in this function
    ['nmod', lambda { |_x| true }],
  ],
  "nonsub" => "dep",
  "obj" => "obj:dir",
  "obl" => [
    # normally a preposition will be subordinate to its noun, this captures
    # adverbial use of prepositions
    ["advmod", lambda { |x| x.adverb? || x.preposition? }],
    ["obl", lambda { |x| x.has_preposition? }],
    # if nominal (NB check for presence of article!)
    # TODO: should be "obj" if the verb is monovalent (even by elision)
    ["iobj", lambda(&:nominal?)],
    ["iobj", lambda(&:adjectival?)], # OBL adjectives are nominalized
    ["advcl", lambda(&:clausal?)], # this seems to happen with ad libros legendos etc. but check closer!
    ["iobj", lambda { |_x| true }],
  ],
  "parpred" => "parataxis",
  "part" => "nmod",
  "per" => "dep",
  "pid" => ["ERROR", lambda { |_x| raise "Remaining pid edge!" }],
  "pred" => [
    ["root", lambda(&:root?)],
    ["ERROR", lambda { |x| raise "#{x.to_n} (head_id #{x.head_id}) is not a root!" }],
  ],
  "rel" => "acl", # add :relcl?
  "sub" => [
    ["nsubj:pass", lambda { |x| x.head && x.head.passive? }],
    ["nsubj", lambda { |_x| true }],
  ],
  "voc" => "vocative",
  "xadv" => [
    ["advcl", lambda(&:clausal?)], # add :contr ?
    ["advmod", lambda { |_x| true }], # add :contr ?
  ],
  "xobj" => "xcomp", # copula cases have already been taken care of
  "xsub" => "xsub",
}
# Heuristic for guessing deponency from the lemma: one pattern per language
# code, each matching a characteristic lemma ending.
DEPONENTS = {
  'lat' => /r\Z/,
  'grc' => /ομαι\Z/
}
# Copular verbs, each written as a single "lemma,part-of-speech,language" string.
COPULAR_LEMMATA = %w[
  sum,V-,lat
  εἰμί#1,V-,grc
]

# Auxiliaries: a fresh array holding the copular lemmata plus an (currently
# empty) list of additional entries.
AUXILIARIES = COPULAR_LEMMATA + []
# Part-of-speech tags that are treated as determiners.
DETERMINERS = %w[S- Pd Px]
# Negation words, each written as a single "lemma,part-of-speech,language"
# string and grouped below by language code.
NEGATION_LEMMATA = %w[
  non,Df,lat ne,Df,lat

  μή,Df,grc μήγε,Df,grc μηδαμῶς,Df,grc μηδέποτε,Df,grc μηδέπω,Df,grc
  μηκέτι,Df,grc μήπω,Df,grc μήτε,Df,grc μήτι,Df,grc μήτιγε,Df,grc
  οὐ,Df,grc οὐδαμῇ,Df,grc οὐδαμῶς,Df,grc οὐδέ,Df,grc οὐδέποτε,Df,grc
  οὐδέπω,Df,grc οὐκέτι,Df,grc οὐκοῦν,Df,grc οὔπω,Df,grc οὔτε,Df,grc
  οὔτι,Df,grc οὐχί,Df,grc

  не,Df,chu ни,Df,chu нѣ,Df,chu

  nei,Df,got ni,Df,got nibai#2,Df,got nih,Df,got
]
# TAM particle lemmata, each a "lemma,part-of-speech,language" string.
TAM_PARTICLE_LEMMATA = %w[
  ἄν,Df,grc
]
# Particle lemmata, each written as a single "lemma,part-of-speech,language"
# string and grouped below by language code.
PARTICLE_LEMMATA = %w[
  at,Df,lat atque,Df,lat autem,Df,lat certe,Df,lat ergo,Df,lat
  et,Df,lat enim,Df,lat etiam,Df,lat igitur,Df,lat immo,Df,lat
  itaque,Df,lat nam,Df,lat nonne,Df,lat nonne,Du,lat quidem,Df,lat
  quoque,Df,lat sic,Df,lat tamen,Df,lat tum,Df,lat tunc,Df,lat
  vero,Df,lat

  ἅμα,Df,grc ἀνά,Df,grc ἆρα,Df,grc ἄραγε,Df,grc ἀτάρ,Df,grc
  ἅτε,Df,grc αὗ,Df,grc αὖθις,Df,grc γάρ,Df,grc γε,Df,grc
  γοῦν,Df,grc δέ,Df,grc δή,Df,grc δῆθεν,Df,grc δηλαδή,Df,grc
  δηλονότι,Df,grc δῆτα,Df,grc εἶτα,Df,grc ἔτι,Df,grc ἦ#2,Df,grc
  ἤγουν,Df,grc ἤδη,Df,grc ἤτοι,Df,grc καίτοι,Df,grc καίτοιγε,Df,grc
  μέν,Df,grc μενοῦνγε,Df,grc μέντοι,Df,grc μήν,Df,grc νά,Df,grc
  νῦν#1,Df,grc νυν#2,Df,grc νυνί,Df,grc οὖν,Df,grc πέρ,Df,grc
  πῃ,Df,grc ποτε,Df,grc πού,Df,grc πω,Df,grc πως,Df,grc
  τάχα,Df,grc τε,Df,grc τοι,Df,grc τοιγαροῦν,Df,grc τοίνυν,Df,grc

  бо,Df,chu же,Df,chu занѥ,Df,chu ибо,Df,chu иде,Df,chu
  ижде,Df,chu ли,Df,chu обаче,Df,chu оубо,Df,chu ти,Df,chu
  тѣ,Df,chu ꙗко#2,Df,chu

  an,Df,got auk,Df,got aufto,Df,got nu,Df,got ussindo,Df,got
  waitei,Df,got þan,Df,got nuh,Df,got nunu,Df,got raihtis,Df,got
  sunsaiw,Df,got unte,Df,got þande,Df,got þannu,Df,got þanuh,Df,got
  þaruh,Df,got
]
# Maps a two-character part-of-speech tag to a list of candidate rules.
#
# Each rule is [target tag, predicate] or [target tag, predicate, features],
# where the predicate is a one-argument lambda receiving a token and the
# optional features entry is a String such as "PronType=Rel".
# NOTE(review): the consumer is not visible here; presumably the first rule
# whose predicate holds is chosen, so rule order within a list matters.
POS_MAP = {
  'A-' => [['ADJ', ->(_x) { true }]],
  'C-' => [['CCONJ', ->(_x) { true }]],
  'Df' => [
    ['AUX', lambda(&:TAM_particle?)],
    ['ADV', lambda(&:negation?), "Polarity=Neg"],
    ['ADV', ->(_x) { true }]
  ],
  'Dq' => [['ADV', ->(_x) { true }, "PronType=Rel"]],
  'Du' => [['ADV', ->(_x) { true }, "PronType=Int"]],
  'F-' => [['X', ->(_x) { true }]],
  'G-' => [['SCONJ', ->(_x) { true }]],
  'I-' => [['INTJ', ->(_x) { true }]],
  'Ma' => [['NUM', ->(_x) { true }]],
  'Mo' => [['ADJ', ->(_x) { true }]],
  'N-' => [['SCONJ', ->(_x) { true }]], # irrelevant for our purposes
  'Nb' => [['NOUN', ->(_x) { true }]],
  'Ne' => [['PROPN', ->(_x) { true }]],
  'Pc' => [['PRON', ->(_x) { true }, "PronType=Rcp"]],
  'Pd' => [['DET', ->(_x) { true }]],
  'Pi' => [['PRON', ->(_x) { true }, "PronType=Int"]],
  'Pk' => [
    ['AUX', ->(x) { x.relation == 'aux' }],
    ['PRON', ->(_x) { true }, "PronType=Prs|Reflex=Yes"]
  ],
  'Pp' => [['PRON', ->(_x) { true }, "PronType=Prs"]],
  'Pr' => [['PRON', ->(_x) { true }, "PronType=Rel"]],
  # NB no evidence for a pronominal/determiner-like nature for Ps and Pt
  'Ps' => [['ADJ', ->(_x) { true }, "Poss=Yes"]],
  'Pt' => [['ADJ', ->(_x) { true }, "Poss=Yes|Reflex=Yes"]],
  'Px' => [['DET', ->(_x) { true }]],
  'Py' => [['PRON', ->(_x) { true }]],
  'R-' => [['ADP', ->(_x) { true }]],
  'V-' => [
    ['AUX', lambda(&:auxiliary?)],
    ['VERB', ->(_x) { true }]
  ],
  'S-' => [['DET', ->(_x) { true }, "Definite=Def|PronType=Dem"]], # (we only have definite articles)
  'X-' => [['X', ->(_x) { true }]]
}
# Maps each morphological category (symbol key) to a table from a
# one-character tag value to its feature string. Ambiguous tag values map to
# comma-joined alternatives (e.g. 'Case=Dat,Gen'); :inflection has no entries.
MORPHOLOGY_MAP = {
  person: {
    '1' => 'Person=1',
    '2' => 'Person=2',
    '3' => 'Person=3'
  },
  number: {
    's' => 'Number=Sing',
    'd' => 'Number=Dual',
    'p' => 'Number=Plur'
  },
  tense: {
    'p' => 'Tense=Pres',
    'i' => 'Tense=Past|Aspect=Imp',
    'r' => 'Tense=Past|Aspect=Perf', # 'Tense=Perfect',
    's' => 'Aspect=Res',
    # tags Perf is not universal
    'a' => 'Tense=Past|Aspect=Perf',
    'u' => 'Tense=Past',
    'l' => 'Tense=Pqp',
    'f' => 'Tense=Fut',
    # tag FutPerfect is not universal
    't' => 'Tense=Fut|Aspect=Perf' # FutPerfect
  },
  mood: {
    'i' => 'VerbForm=Fin|Mood=Ind',
    's' => 'VerbForm=Fin|Mood=Sub',
    'm' => 'VerbForm=Fin|Mood=Imp',
    'o' => 'VerbForm=Fin|Mood=Opt',
    'n' => 'VerbForm=Inf',
    'p' => 'VerbForm=Part',
    'd' => 'VerbForm=Ger',
    # Gdv (gerundive) is not universal
    'g' => 'VerbForm=Gdv',
    'u' => 'VerbForm=Sup',
    'e' => 'VerbForm=Fin|Mood=Ind,Sub',
    'f' => 'VerbForm=Fin|Mood=Imp,Ind',
    'h' => 'VerbForm=Fin|Mood=Imp,Sub',
    't' => 'VerbForm=Fin'
  },
  voice: {
    'a' => 'Voice=Act',
    # Med is not universal
    'm' => 'Voice=Mid',
    'p' => 'Voice=Pass',
    'e' => 'Voice=Mid,Pass'
  },
  gender: {
    'm' => 'Gender=Masc',
    'f' => 'Gender=Fem',
    'n' => 'Gender=Neut',
    'p' => 'Gender=Fem,Masc',
    'o' => 'Gender=Masc,Neut',
    'r' => 'Gender=Fem,Neut'
  },
  case: {
    'n' => 'Case=Nom',
    'a' => 'Case=Acc',
    # Obl(ique) is not universal
    'o' => 'Case=Obl',
    'g' => 'Case=Gen',
    'c' => 'Case=Dat,Gen',
    'e' => 'Case=Acc,Dat',
    'd' => 'Case=Dat',
    'b' => 'Case=Abl',
    'i' => 'Case=Ins',
    'l' => 'Case=Loc',
    'v' => 'Case=Voc'
  },
  degree: {
    'p' => 'Degree=Pos',
    'c' => 'Degree=Cmp',
    's' => 'Degree=Sup'
  },
  # The whole strength category is not universal
  strength: {
    'w' => 'Strength=Weak',
    's' => 'Strength=Strong'
  },
  inflection: {}
}
Class Method Summary
Class Method Details
.process(tb, options = []) ⇒ Object
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
# File 'lib/proiel/cli/converters/conll-u.rb', line 14
#
# Converts every sentence of a treebank and prints the result to standard
# output, one block per sentence: three "# ..." header lines (source, text,
# sent_id), the converted sentence, and a blank line.
#
# A sentence whose conversion raises is skipped: the failure is reported on
# standard error (with a backtrace unless the error is a RuntimeError) and
# counted. A final summary line with the failure count is always written to
# standard error.
#
# @param tb      treebank object; must respond to +sources+, whose elements
#                expose +title+ and +divs+, which in turn expose +title+ and
#                +sentences+
# @param options accepted for interface compatibility; not read by this method
# @return        whatever the final STDERR.puts returns
def process(tb, options = [])
  error_count = 0
  sentence_count = 0

  tb.sources.each do |source|
    source.divs.each do |div|
      div.sentences.each do |sentence|
        sentence_count += 1
        n = Sentence.new sentence

        begin
          # Do the conversion first to avoid spurious headers if the conversion fails
          a = n.convert.to_conll

          puts "# source = #{source.title}, #{div.title}"
          # using printable_form would give us punctuation, which must then be added to the tree
          puts "# text = #{sentence.tokens.map(&:form).compact.join(' ')}"
          puts "# sent_id = #{sentence.id}"
          puts a
          puts
        rescue => e
          error_count += 1
          STDERR.puts "Cannot convert #{sentence.id} (#{sentence.citation}): #{e}"
          # RuntimeErrors are reported by message only; anything else also gets a backtrace
          STDERR.puts e.backtrace.join("\n") unless e.is_a? RuntimeError
        end
      end
    end
  end

  STDERR.puts "#{error_count} sentences out of #{sentence_count} could not be converted"
end