Class: TextProcessing

Inherits:
Object
  • Object
show all
Defined in:
lib/lumix/textprocessing.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(lang = 'ro') ⇒ TextProcessing

Returns a new instance of TextProcessing.



13
14
15
# File 'lib/lumix/textprocessing.rb', line 13

# Creates a new TextProcessing instance.
#
# @param lang [String] value stored in @lang; defaults to 'ro'
def initialize(lang = 'ro')
  @lang = lang
end

Instance Attribute Details

#lang ⇒ Object

Returns the value of attribute lang.



11
12
13
# File 'lib/lumix/textprocessing.rb', line 11

# Reader for the @lang instance variable.
#
# @return [Object] the current value of @lang
def lang
  @lang
end

Instance Method Details

#cleanup(file) ⇒ Object



36
37
38
39
# File 'lib/lumix/textprocessing.rb', line 36

# Decodes HTML entities in the given input.
#
# The HTMLEntities decoder is created on first use and cached in @entities.
#
# NOTE(review): the parameter is called +file+ but is handed straight to
# HTMLEntities#decode; presumably callers pass the text itself rather than
# a path -- confirm against the call sites.
#
# @param file [String] input handed to the entity decoder
# @return [Object] whatever HTMLEntities#decode returns for the input
def cleanup(file)
  @entities ||= HTMLEntities.new
  # decode used to be called with no argument at all, so the parameter was
  # silently ignored and the call could not succeed.
  @entities.decode(file)
end

#create_tagged_filename(infile) ⇒ Object

inserts “tagged” as the second to last part in the filename and as parent folder e.g.

test.txt -> tagged/test.tagged.txt

special case when no extension is present:

README -> tagged/README.tagged


46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/lumix/textprocessing.rb', line 46

# Derives the output path for a tagged version of +infile+.
#
# 'tagged' is added twice: as the second-to-last dot-separated part of the
# file name, and as a new directory directly above the file.
#
#   test.txt     -> tagged/test.tagged.txt
#   a/b/test.txt -> a/b/tagged/test.tagged.txt
#   README       -> tagged/README.tagged   (no extension: appended)
#
# @param infile [String] slash-separated path of the input file
# @return [String] the derived path
def create_tagged_filename(infile)
  directories = infile.split(/\//)
  name_parts = directories.pop.split(/\./)

  # Slot 'tagged' in front of the last part, or append it when the name
  # has only one part.
  slot = [name_parts.size - 1, 1].max
  name_parts.insert(slot, 'tagged')

  (directories + ['tagged', name_parts.join('.')]).join('/')
end

#process(text) ⇒ Object

the core processing routine using the webservice



27
28
29
30
31
32
33
34
# File 'lib/lumix/textprocessing.rb', line 27

# Core processing routine: submits the text to the web service.
#
# The text is converted with #to_utf and sent, together with the value of
# #lang, through the driver returned by #rpc.
#
# @param text [#to_utf] the input to process
# @return [Object] the +processResult+ value of the service response
def process(text)
  service = rpc
  reply = service.Process(:input => text.to_utf, :lang => lang)
  # An earlier, disabled variant issued rpc.request(:process) with a
  # soap.body hash and read [:process_response][:process_result] from the
  # response hash; it is not in use.
  reply.processResult
end

#process_file(infile, outfile = create_tagged_filename(infile)) ⇒ Object

takes the text from infile and outputs the result into the outfile



76
77
78
79
80
81
# File 'lib/lumix/textprocessing.rb', line 76

# Reads the text from +infile+, runs it through #process and writes the
# result to +outfile+.
#
# Opening +outfile+ in 'w' mode does not create missing directories, so the
# target directory has to exist before this is called.
#
# @param infile [String] path of the file to read
# @param outfile [String] path of the file to write; defaults to
#   create_tagged_filename(infile)
# @return [Integer] the value returned by IO#write (bytes written)
def process_file(infile, outfile = create_tagged_filename(infile))
  # Read from the +infile+ parameter; the previous code referenced an
  # undefined local named +file+ and raised NameError on every call.
  result = process(File.read(infile).to_utf)
  File.open(outfile, 'w') do |out|
    out.write result
  end
end

#process_stdin ⇒ Object



71
72
73
# File 'lib/lumix/textprocessing.rb', line 71

# Reads all of standard input, runs it through #process and prints the
# result with puts.
#
# @return [nil]
def process_stdin
  input = $stdin.read
  output = process(input)
  puts output
end

#rpc ⇒ Object



17
18
19
20
21
22
23
24
# File 'lib/lumix/textprocessing.rb', line 17

# Returns the RPC driver for the TextProcessing web service.
#
# The driver is built from the service's WSDL on first use and cached in
# @rpc. A thread-local variant (Thread.current[:rpc]) and a Savon::Client
# variant were tried earlier and are disabled.
#
# @return [Object] the driver produced by SOAP::WSDLDriverFactory#create_rpc_driver
def rpc
  return @rpc if @rpc

  factory = SOAP::WSDLDriverFactory.new('http://www.racai.ro/webservices/TextProcessing.asmx?WSDL')
  @rpc = factory.create_rpc_driver
end

#to_filelist(*files) ⇒ Object



60
61
62
63
64
65
66
67
68
69
# File 'lib/lumix/textprocessing.rb', line 60

# Builds the list of files to process from file names and directories.
#
# Directories are replaced by everything Dir.glob finds below them
# ('**/*'); duplicates are dropped so each file appears once. Directories
# and any path containing '.tagged' are removed from the result.
#
# @param files [Array<String, Array>] file and directory names, arbitrarily nested
# @return [Array<String>] the resulting file names
def to_filelist(*files)
  expanded = files.flatten.map do |entry|
    next entry unless File.directory?(entry)

    Dir.glob(File.join(entry, '**/*')) # everything below that directory
  end

  candidates = expanded.flatten.compact.uniq
  # Globbing also yields sub-directories; drop those and earlier output files.
  candidates.reject { |entry| File.directory?(entry) || entry.include?('.tagged') }
end