Module: DhEasy::Text

Defined in:
lib/dh_easy/text.rb,
lib/dh_easy/text/version.rb

Constant Summary collapse

VERSION =

Gem version

"0.0.6"

Class Method Summary collapse

Class Method Details

.decode_html(text) ⇒ String

Decode HTML entities from text.

Parameters:

  • text (String)

    Text to decode.

Returns:

  • (String)


33
34
35
# File 'lib/dh_easy/text.rb', line 33

# Decode HTML entities found in a text.
#
# @param [String] text Text to decode.
#
# @return [String] Value returned by `CGI.unescapeHTML` for the given text.
def self.decode_html(text)
  CGI.unescapeHTML(text)
end

.default_parser(cell_element, data, key) ⇒ Object

Default cell content parser used to parse cell element.

Parameters:

  • cell_element (Nokogiri::Element)

    Cell element to parse.

  • data (Hash)

    Data hash to save parsed data into.

  • key (String, Symbol)

    Header column key being parsed.



62
63
64
65
66
# File 'lib/dh_easy/text.rb', line 62

# Default cell content parser: stores the stripped text of a cell element
# under `key` inside `data`.
#
# `<i>` elements nested inside the cell are removed before the text is read.
# The XPath is relative (`.//i`) so only descendants of the cell are removed;
# an absolute `//i` would remove every `<i>` of the whole document.
#
# @param [Nokogiri::Element,nil] cell_element Cell element to parse.
# @param [Hash] data Data hash to save parsed data into.
# @param [String,Symbol] key Header column key being parsed.
#
# @return [String,nil] Stored value, or `nil` when `cell_element` is `nil`
#   (in that case `data` is left untouched).
def self.default_parser cell_element, data, key
  return if cell_element.nil?
  # Removing an empty result set is a no-op, so no count check is needed.
  cell_element.search('.//i').remove
  data[key] = strip cell_element.text
end

.encode_html(text) ⇒ String

Encode text for valid HTML entities.

Parameters:

  • text (String)

    Text to encode.

Returns:

  • (String)


24
25
26
# File 'lib/dh_easy/text.rb', line 24

# Encode a text so special characters become HTML entities.
#
# @param [String] text Text to encode.
#
# @return [String] Value returned by `CGI.escapeHTML` for the given text.
def self.encode_html(text)
  CGI.escapeHTML(text)
end

.hash(object) ⇒ String

Create a hash from object

Parameters:

  • object (String, Hash, Object)

    Object to create hash from.

Returns:

  • (String)


14
15
16
17
# File 'lib/dh_easy/text.rb', line 14

# Create a SHA1 hex digest from an object.
#
# A Hash is first serialized into a canonical form (entries sorted, nested
# hashes and arrays handled recursively, leaf values rendered with
# `inspect`) so the digest does not depend on key insertion order and is
# the same on every Ruby process. `Hash#hash` is not used because its value
# may differ between Ruby invocations. Any other object is digested from
# its `to_s` representation.
#
# @param [String,Hash,Object] object Object to create hash from.
#
# @return [String] SHA1 hexadecimal digest.
def self.hash object
  canonical = lambda do |value|
    case value
    when Hash
      pairs = value.map { |k, v| [canonical.call(k), canonical.call(v)] }
      # Sorting the serialized pairs makes insertion order irrelevant.
      body = pairs.sort.map { |pair| pair.join('=>') }.join(', ')
      "{#{body}}"
    when Array
      body = value.map { |item| canonical.call(item) }.join(', ')
      "[#{body}]"
    else
      value.inspect
    end
  end

  source = object.is_a?(Hash) ? canonical.call(object) : object.to_s
  Digest::SHA1.hexdigest source
end

.parse_content(opts) {|data, row, header_map| ... } ⇒ Array<Hash>?

Parse row data matching a selector using a header map to translate

between columns and friendly keys.

Parameters:

  • opts (Hash)

    ({}) Configuration options.

Options Hash (opts):

  • :html (Nokogiri::Element)

    Container element to search into.

  • :selector (String)

    CSS selector to match content cells.

  • :first_row_header (Boolean) — default: false

    If true then first matching element will be assumed to be header and ignored.

  • :header_map (Hash{Symbol,String => Integer})

    Header key vs index dictionary.

  • :column_parsers (Hash{Symbol,String => lambda,proc}) — default: {}

    Custom column parsers for advanced data extraction.

  • :ignore_text_nodes (Boolean) — default: true

    Ignore text nodes when retrieving content cells and rows.

Yield Parameters:

  • data (Hash{Symbol,String => Object})

    Parsed row data.

  • row (Array)

    Raw row data.

  • header_map (Hash{Symbol,String => Integer})

    Header map used.

Yield Returns:

  • (Boolean)

    `true` when valid, else `false`.

Returns:

  • (Array<Hash>, nil)

    Parsed rows data.



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/dh_easy/text.rb', line 89

# Parse row data matching a selector, using a header map to translate
# between column indexes and friendly keys.
#
# @param [Hash] opts Configuration options.
# @option opts [Nokogiri::Element] :html Container element to search into.
# @option opts [String] :selector CSS selector to match content rows.
# @option opts [Boolean] :first_row_header (false) If true then the first
#   matching row is assumed to be the header and is skipped.
# @option opts [Hash{Symbol,String => Integer}] :header_map Header key vs
#   column index dictionary.
# @option opts [Hash{Symbol,String => lambda,proc}] :column_parsers ({})
#   Custom parsers called as `parser.call(cell, row_data, key)`.
# @option opts [Boolean] :ignore_text_nodes (true) Ignore nodes named
#   `text` among rows and row children.
#
# @yieldparam [Hash] data Parsed row data.
# @yieldparam [Object] row Raw row element.
# @yieldparam [Hash] header_map Header map used.
# @yieldreturn [Boolean] Truthy to keep the row, falsy to discard it.
#
# @return [Array<Hash>,nil] Parsed rows data, or `nil` when `:html` is nil.
def self.parse_content opts, &filter
  opts = {
    html: nil,
    selector: nil,
    first_row_header: false,
    header_map: {},
    column_parsers: {},
    ignore_text_nodes: true
  }.merge opts

  # Nothing to search into: honour the documented `nil` return.
  html = opts[:html]
  return nil if html.nil?

  # Setup config
  header_map = opts[:header_map]
  column_parsers = opts[:column_parsers]
  ignore_text_nodes = opts[:ignore_text_nodes]
  skip_header = opts[:first_row_header]
  data = []

  # Get and parse rows
  html.css(opts[:selector]).each do |row|
    next if ignore_text_nodes && row.name == 'text'

    # Only the first non ignored row is treated as header
    if skip_header
      skip_header = false
      next
    end

    # Row cells are resolved once per row, so every column index of the
    # header map is looked up against the same list of cells.
    cells = row.children
    cells = cells.reject { |node| node.name == 'text' } if ignore_text_nodes

    # Extract content data with default or custom parser
    row_data = {}
    header_map.each do |key, index|
      cell = cells[index]
      parser = column_parsers[key]
      if parser.nil?
        default_parser(cell, row_data, key)
      else
        parser.call(cell, row_data, key)
      end
    end

    next unless filter.nil? || filter.call(row_data, row, header_map)
    data << row_data
  end
  data
end

.parse_header_map(opts = {}) ⇒ Hash{Symbol,String => Integer}?

Parse header from selector and create a header map to match a column key

with column index.

Parameters:

  • opts (Hash) (defaults to: {})

    ({}) Configuration options.

Options Hash (opts):

  • :html (Nokogiri::Element)

    Container element to search into.

  • :selector (String)

    CSS selector to match header cells.

  • :column_key_label_map (Hash{Symbol,String => Regex,String})

    Key vs. label dictionary.

  • :first_row_header (Boolean) — default: false

    If true then the selector's first matching row will be used as header for parsing.

  • :ignore_text_nodes (Boolean) — default: true

    Ignore text nodes when retrieving header cells and rows.

Returns:

  • (Hash{Symbol,String => Integer}, nil)

    Key vs. column index map.



166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/dh_easy/text.rb', line 166

# Parse header from selector and create a header map to match a column key
# with its column index.
#
# The map is built from the first matching row that is not ignored; other
# matching rows are not inspected.
#
# @param [Hash] opts ({}) Configuration options.
# @option opts [Nokogiri::Element] :html Container element to search into.
# @option opts [String] :selector CSS selector to match header rows.
# @option opts [Hash{Symbol,String => Regex,String}] :column_key_label_map
#   Key vs. label dictionary.
# @option opts [Boolean] :first_row_header (false) If true then only the
#   selector's first matching row is considered as header.
# @option opts [Boolean] :ignore_text_nodes (true) Ignore nodes named
#   `text` among rows and row children.
#
# @return [Hash{Symbol,String => Integer},nil] Key vs. column index map, or
#   `nil` when the search fails or no usable header row is found.
def self.parse_header_map opts = {}
  opts = {
    html: nil,
    selector: nil,
    column_key_label_map: {},
    first_row_header: false,
    ignore_text_nodes: true
  }.merge opts

  # Setup config
  dictionary = opts[:column_key_label_map]
  ignore_text_nodes = opts[:ignore_text_nodes]

  # Extract header rows; a failing search (e.g. nil html) yields nil
  html_rows = begin
    opts[:html].css(opts[:selector])
  rescue StandardError
    nil
  end
  return nil if html_rows.nil?

  if opts[:first_row_header]
    # An empty match must not turn into a `[nil]` row list.
    first_row = html_rows.first
    html_rows = first_row.nil? ? [] : [first_row]
  end

  html_rows.each do |row|
    next if ignore_text_nodes && row.name == 'text'

    cells = row.children
    cells = cells.reject { |node| node.name == 'text' } if ignore_text_nodes

    # Parse and map column headers
    column_map = {}
    cells.each_with_index do |col, index|
      column_key = translate_label_to_key col, dictionary
      column_map[column_key] = index unless column_key.nil?
    end
    # Only the first usable row defines the header map.
    return column_map
  end
  nil
end

.parse_table(opts = {}) {|data, row, header_map| ... } ⇒ Hash{Symbol => Array,Hash,nil}

Parse data from a horizontal table like structure matching selectors and

using a header map to match columns.

Parameters:

  • opts (Hash) (defaults to: {})

    ({}) Configuration options.

Options Hash (opts):

  • :html (Nokogiri::Element)

    Container element to search into.

  • :header_selector (String)

    Header column elements selector.

  • :header_key_label_map (Hash{Symbol,String => Regex,String})

    Header key vs. label dictionary to match column indexes.

  • :content_selector (String)

    Content row elements selector.

  • :first_row_header (Boolean) — default: false

    If true then the selector's first matching row will be used as header for parsing.

  • :column_parsers (Hash{Symbol,String => lambda,proc}) — default: {}

    Custom column parsers for advanced data extraction.

  • :ignore_text_nodes (Boolean) — default: true

    Ignore text nodes when retrieving cells and rows.

Yield Parameters:

  • data (Hash{Symbol,String => Object})

    Parsed content row data.

  • row (Array)

    Raw content row data.

  • header_map (Hash{Symbol,String => Integer})

    Header map used.

Yield Returns:

  • (Boolean)

    `true` when valid, else `false`.

Returns:

  • (Hash{Symbol => Array,Hash,nil})

    Hash data is as follows:

    • `[Hash] :header_map` Header map used.

    • `[Array<Hash>,nil] :data` Parsed rows data.



226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# File 'lib/dh_easy/text.rb', line 226

# Parse data from a horizontal table like structure: the header map is
# resolved first, then content rows are parsed with it.
#
# @param [Hash] opts ({}) Configuration options (`:html`,
#   `:header_selector`, `:header_key_label_map`, `:content_selector`,
#   `:first_row_header`, `:column_parsers`, `:ignore_text_nodes`).
#
# @yieldparam [Hash] data Parsed content row data.
# @yieldparam [Object] row Raw content row.
# @yieldparam [Hash] header_map Header map used.
#
# @return [Hash{Symbol => Array,Hash,nil},nil] `{header_map:, data:}`, or
#   `nil` when `:html` is nil or no header map could be resolved.
def self.parse_table(opts = {}, &filter)
  settings = {
    html: nil,
    header_selector: nil,
    header_key_label_map: {},
    content_selector: nil,
    first_row_header: false,
    column_parsers: {},
    ignore_text_nodes: true
  }.merge(opts)

  container = settings[:html]
  return nil if container.nil?

  # Resolve column indexes from the header cells
  header_options = {
    html: container,
    selector: settings[:header_selector],
    column_key_label_map: settings[:header_key_label_map],
    first_row_header: settings[:first_row_header],
    ignore_text_nodes: settings[:ignore_text_nodes]
  }
  header_map = parse_header_map(header_options)
  return nil if header_map.nil?

  # Parse content rows, forwarding the caller's filter block
  content_options = {
    html: container,
    selector: settings[:content_selector],
    header_map: header_map,
    first_row_header: settings[:first_row_header],
    column_parsers: settings[:column_parsers],
    ignore_text_nodes: settings[:ignore_text_nodes]
  }
  rows = parse_content(content_options, &filter)

  { header_map: header_map, data: rows }
end

.parse_vertical_table(opts = {}) {|data, row, header_map| ... } ⇒ Hash{Symbol => Array,Hash,nil}

Parse data from a vertical table like structure matching selectors and

using a header map to match columns.

Parameters:

  • opts (Hash) (defaults to: {})

    ({}) Configuration options.

Options Hash (opts):

  • :html (Nokogiri::Element)

    Container element to search into.

  • :row_selector (String)

    Vertical row like elements selector.

  • :header_selector (String)

    Header column elements selector.

  • :header_key_label_map (Hash{Symbol,String => Regex,String})

    Header key vs. label dictionary to match column indexes.

  • :content_selector (String)

    Content row elements selector.

  • :column_parsers (Hash{Symbol,String => lambda,proc}) — default: {}

    Custom column parsers for advanced data extraction.

  • :ignore_text_nodes (Boolean) — default: true

    Ignore text nodes when retrieving cells and rows.

Yield Parameters:

  • data (Hash{Symbol,String => Object})

    Parsed content row data.

  • row (Array)

    Raw content row data.

  • header_map (Hash{Symbol,String => Integer})

    Header map used.

Yield Returns:

  • (Boolean)

    `true` when valid, else `false`.

Returns:

  • (Hash{Symbol => Array,Hash,nil})

    Hash data is as follows:

    • `[Hash] :header_map` Header map used.

    • `[Array<Hash>,nil] :data` Parsed rows data.



276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
# File 'lib/dh_easy/text.rb', line 276

# Parse data from a vertical table like structure, where every matching
# row holds one header element and one content element.
#
# For each row the header element is translated into a key; rows whose key
# is `nil` or an empty string are skipped. The content element is then
# handed to the custom parser registered for that key, or to the default
# parser, together with the result hash.
#
# NOTE(review): the block argument is accepted but never invoked by this
# method.
#
# @param [Hash] opts ({}) Configuration options (`:html`, `:row_selector`,
#   `:header_selector`, `:header_key_label_map`, `:content_selector`,
#   `:column_parsers`, `:ignore_text_nodes`).
#
# @return [Hash,nil] Hash filled by the parsers, or `nil` when `:html` is
#   nil or the row search fails.
def self.parse_vertical_table(opts = {}, &filter)
  settings = {
    html: nil,
    row_selector: nil,
    header_selector: nil,
    header_key_label_map: {},
    content_selector: nil,
    column_parsers: {},
    ignore_text_nodes: true
  }.merge(opts)

  container = settings[:html]
  return nil if container.nil?

  label_map = settings[:header_key_label_map]
  parsers = settings[:column_parsers]

  # A failing row search is reported as nil instead of raising
  rows = begin
    container.css(settings[:row_selector])
  rescue StandardError
    nil
  end
  return nil if rows.nil?

  rows.each_with_object({}) do |row, parsed|
    # Translate the row header into its key
    header = row.css(settings[:header_selector])
    key = translate_label_to_key(header, label_map)
    next if key.nil? || key == ''

    # Parse the row content with the custom parser when one is registered
    content = row.css(settings[:content_selector])
    parser = parsers[key]
    if parser.nil?
      default_parser(content, parsed, key)
    else
      parser.call(content, parsed, key)
    end
  end
end

.strip(raw_text, orig_encoding = 'ASCII') ⇒ String?

Strip a value by trimming spaces, reducing sequential spaces into a

single space, decoding HTML entities and changing encoding to UTF-8.

Parameters:

  • raw_text (String, Object, nil)

    Text to strip.

  • orig_encoding (String) (defaults to: 'ASCII')

    Text original encoding.

Returns:

  • (String, nil)

    `nil` when raw_text is nil, else `String`.



44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/dh_easy/text.rb', line 44

# Strip a value by trimming spaces, reducing sequential whitespace
# (including U+3000 and U+00A0) into a single space and decoding HTML
# entities.
#
# When the text cannot be matched against a UTF-8 regex (the match raises),
# a copy of it is reinterpreted as `orig_encoding` and transcoded to UTF-8,
# replacing invalid or undefined characters. The caller's string is never
# modified, so frozen strings are accepted.
#
# @param [String,Object,nil] raw_text Text to strip; non strings are
#   converted with `to_s`.
# @param [String] orig_encoding ('ASCII') Encoding assumed for text that
#   fails the UTF-8 match.
#
# @return [String,nil] `nil` when `raw_text` is nil, else the cleaned text.
def self.strip raw_text, orig_encoding = 'ASCII'
  return nil if raw_text.nil?
  raw_text = raw_text.to_s unless raw_text.is_a? String
  regex = /(\s|\u3000|\u00a0)+/

  # The match result is irrelevant here: only whether matching raises.
  good_encoding = begin
    raw_text =~ /\u3000/
    true
  rescue StandardError
    false
  end

  unless good_encoding
    # `force_encoding` mutates its receiver, so work on a copy.
    raw_text = raw_text.dup.force_encoding(orig_encoding).encode('UTF-8', invalid: :replace, undef: :replace)
    regex = /(\s|\u3000|\u00a0|\u00c2\u00a0)+/
  end

  decode_html raw_text.gsub(regex, ' ').strip
end

.translate_label_to_key(element, label_map) ⇒ Symbol, String

Extract column label and translate it into a friendly key.

Parameters:

  • element (Nokogiri::Element)

    Html element to parse.

  • label_map (Hash{Symbol,String => Regex,String})

    Label dictionary for translation into key.

Returns:

  • (Symbol, String)

    Translated key.



142
143
144
145
146
147
148
149
150
# File 'lib/dh_easy/text.rb', line 142

# Extract a column label from an element and translate it into a friendly
# key using a label dictionary.
#
# `<i>` elements nested inside the element are removed before the label
# text is read. The XPath is relative (`.//i`) so only descendants of the
# element are removed; an absolute `//i` would remove every `<i>` of the
# whole document.
#
# @param [Nokogiri::Element,nil] element Html element to parse.
# @param [Hash{Symbol,String => Regex,String}] label_map Label dictionary
#   for translation into key. Regexp labels are matched against the
#   stripped text, any other label is compared for equality.
#
# @return [Symbol,String,nil] Key of the first matching dictionary entry,
#   or `nil` when `element` is nil or no label matches.
def self.translate_label_to_key element, label_map
  return nil if element.nil?
  # Removing an empty result set is a no-op, so no count check is needed.
  element.search('.//i').remove
  text = strip element.text
  key, _label = label_map.find do |_key, label|
    label.is_a?(Regexp) ? (text =~ label) : (text == label)
  end
  key
end