Class: NLP::Lemmatizer
Class Method Summary
- .lematize(text, method, input_type) ⇒ Object
- .morfeusz_lematize(text) ⇒ Object
- .parse_lematized_xml(doc) ⇒ Object
- .takipi_lematize(text, method) ⇒ Object
Class Method Details
.lematize(text, method, input_type) ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/lemmatizer.rb', line 9
#
# Lemmatizes +text+ with the selected backend.
#
# @param text [String, File] raw text, or an open File whose whole content is
#   read. A File passed in is always closed by this method.
# @param method [Symbol] +:takipi+ dispatches to +takipi_lematize+; any other
#   value dispatches to +morfeusz_lematize+.
# @param input_type [Object] forwarded unchanged as the second argument of
#   +takipi_lematize+; ignored on the Morfeusz path.
# @return [Object] whatever the selected backend method returns.
# @raise [ArgumentError] if +text+ is neither a String nor a File.
def self.lematize(text, method, input_type)
  str =
    case text
    when File
      begin
        text.read
      ensure
        # Close the handle even when the read fails, so it never leaks.
        text.close
      end
    when String
      text
    else
      raise ArgumentError, "Argument is not String or File"
    end

  if method == :takipi
    takipi_lematize(str, input_type)
  else
    # Default lematization method is Morfeusz
    morfeusz_lematize(str)
  end
end
.morfeusz_lematize(text) ⇒ Object
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# File 'lib/lemmatizer.rb', line 53
#
# Splits +text+ into sentences on '.', '!' and '?', splits each sentence into
# whitespace-separated tokens and looks every token up with
# Morfeusz::Lexeme.find.
#
# Each token becomes a Word built from (token, base form, ""), where the base
# form is taken from the first lookup result, or "" when the lookup returns
# nothing usable. Fragments that contain no tokens (produced by consecutive or
# trailing punctuation such as "..." or ". ") are skipped instead of yielding
# empty sentences.
#
# @param text [String] raw text to lemmatize.
# @return [Array] one Sentence per non-empty fragment; each Sentence has
#   received a single Array of Word objects via +<<+.
def self.morfeusz_lematize(text)
  # simple tagger
  # TODO: lematizer should take block or object Tagger that defines
  # how split string
  text.split(/\.|!|\?/).each_with_object([]) do |fragment, sentences|
    tokens = fragment.split(" ")
    next if tokens.empty?

    words = tokens.map do |token|
      lexemes = Morfeusz::Lexeme.find(token)
      lexeme = lexemes && lexemes[0]
      Word.new(token, lexeme ? lexeme.base_form : "", "")
    end

    sentence = Sentence.new
    sentence << words
    sentences.push(sentence)
  end
end
.parse_lematized_xml(doc) ⇒ Object
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/lemmatizer.rb', line 78
#
# Converts a parsed lemmatizer XML document into a Text.
#
# Every element matched by "*/chunkList/chunk" becomes one Sentence. Inside a
# chunk every "tok" element becomes a Word built from:
# * the text of the tok's first child element (the surface form),
# * the text of the first and second child elements of a "lex" element that
#   has attributes; when several such "lex" elements exist the last one wins.
# Lemma and inflection both default to "" when no "lex" element qualifies.
#
# @param doc [Object] parsed XML document exposing +elements.each(xpath)+
#   (presumably a REXML::Document; confirm against the file's requires).
# @return [Text] the text, with one Sentence appended per chunk; each Sentence
#   has received a single Array of Word objects via +<<+.
def self.parse_lematized_xml(doc)
  text = Text.new

  doc.elements.each("*/chunkList/chunk") do |chunk|
    tokens = []

    chunk.elements.each("tok") do |tok|
      word = tok.elements[1].text
      # Initialise both explicitly: `lemat, inflect = ""` would be a multiple
      # assignment that leaves inflect nil instead of "".
      lemat = ""
      inflect = ""

      tok.elements.each("lex") do |lex|
        next unless lex.has_attributes?

        lemat = lex.elements[1].text
        inflect = lex.elements[2].text
      end

      tokens << Word.new(word, lemat, inflect)
    end

    sentence = Sentence.new
    sentence << tokens
    text << sentence
  end

  text
end
.takipi_lematize(text, method) ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
# File 'lib/lemmatizer.rb', line 30
#
# Lemmatizes +text+ with TaKIPI and returns the parsed result.
#
# With +:local+ the text is written to "text.txt" in the current directory,
# the +takipi+ command line tool is run on it, and the "output.xml" it is
# asked to produce is parsed. With +:remote+ the text is sent through
# TakipiWebService.request and the returned XML is parsed.
#
# @param text [String] raw text to lemmatize.
# @param method [Symbol] +:local+ or +:remote+.
# @return [Object] the value returned by +parse_lematized_xml+.
# @raise [ArgumentError] if +method+ is neither +:local+ nor +:remote+.
def self.takipi_lematize(text, method)
  if method == :local
    # The command reads text.txt, so the input has to be written there first;
    # otherwise +text+ is ignored and a stale file is processed.
    File.write("text.txt", text)
    # Backticks already block until the command exits; no helper thread needed.
    `takipi -i text.txt -o output.xml -it TXT`
    # Block form closes the handle once the document has been built.
    # NOTE(review): assumes Document.new consumes the IO eagerly (true for
    # REXML::Document) - confirm which Document class is in scope.
    doc = File.open("output.xml", "r") { |f| Document.new(f) }
  elsif method == :remote
    doc = Document.new(TakipiWebService.request(text))
  else
    raise ArgumentError, 'Argument is not :local or :remote'
  end

  parse_lematized_xml(doc)
end