Module: WebParser

Defined in:
lib/web_parser/web_parser.rb

Constant Summary

# Default option values, keyed by string: 'max_workers' plus a nested 'queue'
# hash holding the server address, the in/out queue names and 'wait_time'.
DEFAULT_OPTIONS = {
  'max_workers' => 5,
  'queue' => {
    'server' => '127.0.0.1:22122',
    'in_queue' => 'in_queue',
    'out_queue' => 'out_queue',
    'wait_time' => 1
  }
}

Class Method Summary

Class Method Details

.extract(xhtml, template) ⇒ Object



39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/web_parser/web_parser.rb', line 39

# Extracts every object described by +template+ from the given XHTML.
#
# @param xhtml the markup handed to +xhtml_to_dom+
# @param template a Hash mapping object names to attribute hashes; anything
#   that is not exactly a Hash is first passed to +Template.load_template+
# @return [Hash] object name => extraction result (+nil+ when the entry's
#   'type' is not 'single', 'list' or 'special')
def extract(xhtml, template)
  # Anything that is not a plain Hash is treated as a template file and loaded.
  template = Template.load_template(template) unless template.instance_of?(Hash)
  dom = xhtml_to_dom(xhtml)

  template.each_with_object({}) do |entry, extracted|
    name = entry[0]
    attributes = entry[1]
    # Dispatch on the declared type; an unrecognised type yields nil.
    extracted[name] =
      case attributes['type']
      when 'single'  then extract_single(dom, attributes)
      when 'list'    then extract_list(dom, attributes)
      when 'special' then extract_special(dom, attributes)
      end
  end
end

.extract_from_file(web_file, template, options = {}) ⇒ Object



63
64
65
66
# File 'lib/web_parser/web_parser.rb', line 63

# Obtains a document for +web_file+ via +get_doc_from_file+ (forwarding
# +options+) and runs +extract+ on it with +template+.
#
# @return whatever +extract+ returns for the obtained document
def extract_from_file(web_file, template, options = {})
  extract(get_doc_from_file(web_file, options), template)
end

.extract_from_url(url, template, options = {}) ⇒ Object



58
59
60
61
# File 'lib/web_parser/web_parser.rb', line 58

# Obtains a document for +url+ via +get_doc_from_url+ (forwarding +options+)
# and runs +extract+ on it with +template+.
#
# @return whatever +extract+ returns for the obtained document
def extract_from_url(url, template, options = {})
  extract(get_doc_from_url(url, options), template)
end