Class: NLP::Lemmatizer

Inherits:
Object
  • Object
show all
Includes:
REXML
Defined in:
lib/tagger/lemmatizer.rb

Class Method Summary collapse

Class Method Details

.lemmatize(text, method = nil, input_type = nil) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/tagger/lemmatizer.rb', line 8

# Lemmatizes +text+ and returns the parsed result (a Text built by
# .parse_lemmatized_xml).
#
# text       - a String, or an open File (read fully and closed here).
# method     - :takipi to use the TaKIPI tagger with the given
#              +input_type+; any other value falls back to the default
#              backend. NOTE(review): despite the original comment, the
#              default is currently the remote TaKIPI service — the
#              Morfeusz call is disabled below.
# input_type - forwarded to takipi_lemmatize (:local or :remote) when
#              method is :takipi.
#
# Raises ArgumentError when +text+ is neither a String nor a File.
def self.lemmatize(text, method = nil, input_type = nil)
  if text.is_a? File
    str = text.read
    text.close
  elsif text.is_a? String
    str = text
  else
    raise ArgumentError, "Argument is not String or File"
  end

  if method == :takipi
    takipi_lemmatize(str, input_type)
  else
    # Default lemmatization was meant to be Morfeusz, but that backend is
    # disabled; the remote TaKIPI service is used instead.
    # morfeusz_lemmatize(str)
    takipi_lemmatize(str, :remote)
  end
end

.morfeusz_lemmatize(text) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/tagger/lemmatizer.rb', line 57

# Lemmatizes +text+ with Morfeusz. Currently a stub: the tagging logic is
# commented out, so an empty Text is always returned regardless of input.
def self.morfeusz_lemmatize(text)
  # TODO: the lemmatizer should take a TokenScanner object that defines
  # how to split the string.
  #
  # Disabled simple tagger:
  # text.split(/\.|!|\?/).each do |s|
  #   sentence = Sentence.new
  #   sentence << s.split(" ").collect { |t|
  #     if (word = Morfeusz::Lexeme.find(t)) && word[0]
  #       Word.new(t, word[0].base_form, "")
  #     else
  #       Word.new(t, "", "")
  #     end
  #   }
  #   temp_text << sentence
  # end
  Text.new
end

.parse_lemmatized_xml(doc) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/tagger/lemmatizer.rb', line 82

# Converts a TaKIPI-style XML document into a Text of Sentences of Words.
#
# doc - a REXML::Document whose structure is */chunkList/chunk/tok, where
#       each <tok> holds the surface form as its first child element and
#       one or more <lex> children with base form and tag.
#
# Returns a Text object.
def self.parse_lemmatized_xml(doc)
  text = Text.new

  doc.elements.each("*/chunkList/chunk") do |chunk|
    sentence = Sentence.new
    tokens = []

    chunk.elements.each("tok") do |tok|
      word = tok.elements[1].text
      # BUGFIX: the original `lemat, inflect = ""` was a multiple
      # assignment from a single value, which left inflect == nil.
      # Initialize both to the empty string.
      lemat = inflect = ""

      tok.elements.each("lex") do |lex|
        # Only a <lex> carrying attributes (e.g. the disambiguated
        # reading) supplies the chosen base form and tag.
        if lex.has_attributes?
          lemat = lex.elements[1].text
          inflect = lex.elements[2].text
        end
      end

      tokens << Word.new(word, lemat, inflect)
    end

    sentence << tokens
    text << sentence
  end
  text
end

.takipi_lemmatize(text, method) ⇒ Object



31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/tagger/lemmatizer.rb', line 31

# Runs the TaKIPI tagger over +text+ and parses the resulting XML.
#
# method - :local  runs the locally installed `takipi` binary via the
#                  shell, writing input to /tmp/text.txt;
#          :remote sends +text+ to TakipiWebService.
#
# Raises ArgumentError for any other +method+ value.
def self.takipi_lemmatize(text, method)
  if method == :local
    xml_file = TAKIPI_XML_FILE

    # Write the input from Ruby instead of `echo '#{text}' > ...` —
    # interpolating +text+ into a shell command line was a shell-injection
    # hazard (a single quote in the input broke or hijacked the command).
    # The trailing newline matches what `echo` produced.
    File.write("/tmp/text.txt", "#{text}\n")
    # The original wrapped this call in Thread.new { ... }.join, which
    # adds no concurrency — run it directly.
    `takipi -i /tmp/text.txt -o #{xml_file} -it TXT`

    # Block form guarantees the handle is closed (the original leaked it).
    doc = File.open(xml_file, "r") { |f| Document.new(f) }

  elsif method == :remote
    xml = TakipiWebService.request(text)
    doc = Document.new xml
  else
    raise ArgumentError, 'Argument is not :local or :remote'
  end

  parse_lemmatized_xml(doc)
end