Class: NLP::Lemmatizer
- Inherits: Object
- Hierarchy: Object → NLP::Lemmatizer
- Includes:
- REXML
- Defined in:
- lib/tagger/lemmatizer.rb
Class Method Summary collapse
- .lemmatize(text, method = nil, input_type = nil) ⇒ Object
- .morfeusz_lemmatize(text) ⇒ Object
- .parse_lemmatized_xml(doc) ⇒ Object
- .takipi_lemmatize(text, method) ⇒ Object
Class Method Details
.lemmatize(text, method = nil, input_type = nil) ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
# File 'lib/tagger/lemmatizer.rb', line 8

# Lemmatizes +text+ using the selected backend.
#
# @param text [String, File] input to lemmatize; a File is read fully and
#   then closed by this method
# @param method [Symbol, nil] :takipi to use the TaKIPI backend with the
#   given +input_type+; any other value falls through to the default branch
# @param input_type [Symbol, nil] forwarded to takipi_lemmatize when
#   method == :takipi (e.g. :local or :remote)
# @return [Object] the lemmatized text built by the backend
# @raise [ArgumentError] when +text+ is neither a String nor a File
def self.lemmatize(text, method = nil, input_type = nil)
  if text.is_a?(File)
    str = text.read
    text.close
  elsif text.is_a?(String)
    str = text
  else
    raise ArgumentError, "Argument is not String or File"
  end

  # == (not ===) is the idiomatic symbol comparison here.
  if method == :takipi
    takipi_lemmatize(str, input_type)
  else
    # The default lemmatization method is intended to be Morfeusz, but that
    # backend is currently disabled (see morfeusz_lemmatize); the remote
    # TaKIPI service is used instead.
    takipi_lemmatize(str, :remote)
    # morfeusz_lemmatize(str)
  end
end
.morfeusz_lemmatize(text) ⇒ Object
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
# File 'lib/tagger/lemmatizer.rb', line 57 def self.morfeusz_lemmatize(text) temp_text = Text.new #simple tagger #TODO lemmatizer should take TokenScanner object that defines #how split string # text.split(/\.|!|\?/).each do |s| # sentence = Sentence.new # sentence << s.split(" ").collect{ |t| # if word = Morfeusz::Lexeme.find(t) # if word[0] # Word.new(t,word[0].base_form,"") # else # Word.new(t,"","") # end # else # Word.new(t,"","") # end # } # temp_text << sentence # end temp_text end |
.parse_lemmatized_xml(doc) ⇒ Object
82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
# File 'lib/tagger/lemmatizer.rb', line 82

# Builds a Text object from TaKIPI's lemmatized XML output.
#
# Walks every */chunkList/chunk element, turning each chunk into a Sentence
# whose tokens are Word(form, lemma, inflection) triples taken from the
# first attributed <lex> child of each <tok>.
#
# @param doc [REXML::Document] parsed TaKIPI XML
# @return [Text] the reconstructed text
def self.parse_lemmatized_xml(doc)
  text = Text.new

  doc.elements.each("*/chunkList/chunk") do |chunk|
    sentence = Sentence.new
    tokens = []

    chunk.elements.each("tok") do |tok|
      word = tok.elements[1].text
      # BUG FIX: the original `lemat, inflect = ""` destructured a single
      # string, leaving inflect == nil for tokens without an attributed
      # <lex>; both must default to the empty string.
      lemat = inflect = ""

      tok.elements.each("lex") do |lex|
        if lex.has_attributes?
          lemat = lex.elements[1].text
          inflect = lex.elements[2].text
        end
      end

      tokens << Word.new(word, lemat, inflect)
    end

    # NOTE(review): the token array is appended as a single element, as in
    # the original — presumably Sentence#<< flattens it; verify.
    sentence << tokens
    text << sentence
  end

  text
end
.takipi_lemmatize(text, method) ⇒ Object
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/tagger/lemmatizer.rb', line 31

# Runs the TaKIPI tagger over +text+, either as a local binary or via the
# remote web service, and parses the resulting XML.
#
# @param text [String] raw text to lemmatize
# @param method [Symbol] :local to shell out to the takipi binary,
#   :remote to call TakipiWebService
# @return [Text] the parsed, lemmatized text
# @raise [ArgumentError] when +method+ is neither :local nor :remote
def self.takipi_lemmatize(text, method)
  if method == :local
    xml_file = TAKIPI_XML_FILE
    # SECURITY FIX: write the input with Ruby instead of `echo '#{text}'`;
    # a single quote (or other shell metacharacter) in the text could break
    # out of the quoting and inject arbitrary shell commands. puts mimics
    # echo's trailing newline. The pointless Thread.new + immediate join
    # around the shell-out is also dropped — it added no concurrency.
    File.open("/tmp/text.txt", "w") { |f| f.puts text }
    `takipi -i /tmp/text.txt -o #{xml_file} -it TXT`
    # Block form guarantees the XML file handle is closed even if parsing
    # raises (the original leaked the handle).
    doc = File.open(xml_file, "r") { |f| Document.new(f) }
  elsif method == :remote
    xml = TakipiWebService.request(text)
    doc = Document.new(xml)
  else
    raise ArgumentError, 'Argument is not :local or :remote'
  end

  parse_lemmatized_xml(doc)
end