Class: Biblionet::Extractors::CategoryExtractor
- Defined in:
- lib/bookshark/extractors/category_extractor.rb
Instance Attribute Summary collapse
-
#categories ⇒ Object
readonly
Returns the value of attribute categories.
Attributes inherited from Base
#biblionet_id, #filepath, #page, #url
Instance Method Summary collapse
- #extract_categories(category_page = @page) ⇒ Object
- #extract_categories_from(uri = nil) ⇒ Object
-
#initialize(uri = nil) ⇒ CategoryExtractor
constructor
A new instance of CategoryExtractor.
Methods inherited from Base
#decode_text, decode_text, #load_page, #load_page_from_file, #load_page_from_url, #present?, #save_page
Methods included from FileManager
#list_directories, #list_files, #save_to
Constructor Details
#initialize(uri = nil) ⇒ CategoryExtractor
Returns a new instance of CategoryExtractor.
12 13 14 15 |
# File 'lib/bookshark/extractors/category_extractor.rb', line 12
#
# Builds a new CategoryExtractor and, when possible, extracts the categories
# right away.
#
# The uri is handed to the superclass constructor (Base) unchanged.
# NOTE(review): @page is presumably populated by Base#initialize from the
# given uri -- confirm against Base.
#
# @param uri [Object, nil] location of the category page; when nil no
#   extraction is attempted.
def initialize(uri = nil)
  super(uri)

  # Nothing to extract without a uri or without a loaded page.
  return if uri.nil? || @page.nil?

  extract_categories
end
Instance Attribute Details
#categories ⇒ Object (readonly)
Returns the value of attribute categories.
10 11 12 |
# File 'lib/bookshark/extractors/category_extractor.rb', line 10
#
# Read-only accessor for the categories collected by the last extraction.
#
# @return [Object] the current value of @categories (nil until it is assigned).
def categories
  @categories
end
Instance Method Details
#extract_categories(category_page = @page) ⇒ Object
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/bookshark/extractors/category_extractor.rb', line 17
#
# Extracts the category links found in a category page and stores them in
# @categories.
#
# Every <a class="menu"> whose href contains '/index/' becomes one entry,
# keyed by the last path segment of its href (the id used by biblionet).
# The entry is whatever proccess_category returns for the link text, with a
# :parent key added. When the id of this extractor (@biblionet_id) is among
# the entries, a copy of that entry is also stored under :current, together
# with :b_id.
#
# NOTE(review): extraction is skipped when @page is nil, even if an explicit
# category_page is passed -- confirm whether the guard should test
# category_page instead.
#
# @param category_page [Object] markup handed to Nokogiri::HTML; defaults to
#   the page held by the extractor.
# @return [Hash, nil] the extracted categories, or nil when present? reports
#   that @categories holds nothing.
def extract_categories(category_page=@page)
  unless @page.nil?
    page = Nokogiri::HTML(category_page)
    parent = nil
    previous_indent = nil
    previous_id = nil

    links = page.xpath("//a[@class='menu' and @href[contains(.,'/index/') ]]")

    @categories = links.each_with_object({}) do |link, extracted|
      # Extract from href the id used by biblionet. --- DdC url http://biblionet.gr/index/id ---
      biblionet_id = link[:href].split('/').last

      # Get the text before <a>. It is expected to be a number of space characters.
      # A link without a preceding sibling is treated as having no indent.
      # TODO: make sure text is only spaces
      sibling = link.previous_sibling
      indent = sibling.nil? ? 0 : sibling.text.size

      # Determine parent-child-sibling relationships based on indent.
      # Indent size seems to be inconsistent, so it is better to compare sizes
      # than to actually use them.
      # NOTE(review): a smaller indent never resets parent, so a page that
      # steps back out to a shallower level keeps the deeper parent -- confirm
      # that such pages do not occur.
      if previous_indent.nil?
        previous_indent = indent
      elsif indent > previous_indent
        parent = previous_id
        previous_indent = indent
      end

      previous_id = biblionet_id

      # Extract DdC id and DdC text.
      # NOTE(review): proccess_category is defined outside this listing;
      # presumably it returns a Hash -- it must at least respond to merge.
      extracted[biblionet_id] = proccess_category(link.text).merge(parent: parent)
    end
  end

  return nil unless present?(@categories)

  # Only expose :current when the page actually listed this extractor's id;
  # otherwise there is nothing to copy.
  current = @categories[@biblionet_id.to_s]
  unless current.nil?
    @categories[:current] = current.clone
    @categories[:current][:b_id] = @biblionet_id
  end

  @categories
end
#extract_categories_from(uri = nil) ⇒ Object
58 59 60 61 |
# File 'lib/bookshark/extractors/category_extractor.rb', line 58
#
# Loads the page at the given uri and then extracts its categories.
#
# load_page is always called, even with a nil uri; extraction only follows
# when a uri was given and a page is available in @page afterwards.
# NOTE(review): load_page is inherited and presumably assigns @page -- confirm.
#
# @param uri [Object, nil] location of the category page.
# @return [Object, nil] the result of extract_categories, or nil when the
#   extraction was skipped.
def extract_categories_from(uri = nil)
  load_page(uri)

  return nil if uri.nil? || @page.nil?

  extract_categories
end