Class: NLP::Lemmatizer

Inherits:
Object
  • Object
show all
Includes:
REXML
Defined in:
lib/lemmatizer.rb

Class Method Summary collapse

Class Method Details

.lematize(text, method, input_type) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/lemmatizer.rb', line 9

# Lemmatizes +text+ with the selected backend.
#
# @param text [String, File] input to lemmatize; a File is fully read and closed
# @param method [Symbol] :takipi to use the TAKIPI tagger; any other value
#   falls through to the default Morfeusz backend
# @param input_type [Symbol] forwarded to takipi_lematize (:local or :remote);
#   unused by the Morfeusz backend
# @return [Object] whatever the selected backend returns
# @raise [ArgumentError] when +text+ is neither a String nor a File
def self.lematize(text, method, input_type)
    if text.is_a? File
        str = text.read
        text.close
    elsif text.is_a? String
        str = text
    else
        raise ArgumentError, "Argument is not String or File"
    end

    # `==` is the correct symbol comparison; the original `===` only
    # worked incidentally (Symbol#=== falls back to #==).
    if method == :takipi
        takipi_lematize(str, input_type)
    else
        # Default lemmatization backend is Morfeusz.
        morfeusz_lematize(str)
    end
end

.morfeusz_lematize(text) ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/lemmatizer.rb', line 53

# Splits +text+ into sentences on '.', '!' or '?', tokenizes each sentence
# on spaces, and looks every token up in the Morfeusz lexicon. A token whose
# lexeme is found gets its base form; otherwise the base form is empty.
# Returns the resulting sentences as an Array.
def self.morfeusz_lematize(text)
     sentences = []

     # Simple tagger.
     # TODO: the lemmatizer should take a block or a Tagger object that
     # defines how to split the string.
     text.split(/\.|!|\?/).each do |raw_sentence|
         words = raw_sentence.split(" ").collect do |token|
             lexeme = Morfeusz::Lexeme.find(token)
             if lexeme && lexeme[0]
                 Word.new(token, lexeme[0].base_form, "")
             else
                 Word.new(token, "", "")
             end
         end

         sentence = Sentence.new
         sentence << words
         sentences.push sentence
    end
    sentences
end

.parse_lematized_xml(doc) ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/lemmatizer.rb', line 78

def self.parse_lematized_xml(doc)
    
    text = Text.new

    doc.elements.each("*/chunkList/chunk") do |chunk| 
        sentence = Sentence.new
        tokens = []

        chunk.elements.each("tok") do |tok|
           word = tok.elements[1].text
           lemat, inflect = ""

           tok.elements.each("lex") do |lex|
                if lex.has_attributes?
                    lemat = lex.elements[1].text
                    inflect = lex.elements[2].text
                end
           end
          
           tokens << Word.new(word,lemat,inflect)
        end

        sentence << tokens
        text << sentence
    end
    text
end

.takipi_lematize(text, method) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/lemmatizer.rb', line 30

# Runs the TAKIPI tagger either locally or through the remote web service,
# then parses the produced XML.
#
# @param text [String] text to lemmatize. NOTE(review): the :local branch
#   shells out with a fixed `text.txt` input file and never writes +text+
#   into it — presumably the caller prepares that file; confirm.
# @param method [Symbol] :local or :remote
# @return the result of parse_lematized_xml on the tagger output
# @raise [ArgumentError] for any method other than :local or :remote
def self.takipi_lematize(text, method)

    if method == :local
        # The original spawned a Thread and joined it immediately, which
        # added nothing; run the tagger synchronously instead.
        `takipi -i text.txt -o output.xml -it TXT`

        # Block form guarantees the handle is closed; the original leaked
        # the opened File.
        doc = File.open("output.xml", "r") { |f| Document.new(f) }
    elsif method == :remote
        xml = TakipiWebService.request(text)
        doc = Document.new xml
    else
        raise ArgumentError, 'Argument is not :local or :remote'
    end

    parse_lematized_xml(doc)
end