Module: DhEasy::Text
- Defined in:
- lib/dh_easy/text.rb,
lib/dh_easy/text/version.rb
Constant Summary collapse
- VERSION =
Gem version
"0.0.6"
Class Method Summary collapse
-
.decode_html(text) ⇒ String
Decode HTML entities from text.
-
.default_parser(cell_element, data, key) ⇒ Object
Default cell content parser used to parse cell element.
-
.encode_html(text) ⇒ String
Encode text for valid HTML entities.
-
.hash(object) ⇒ String
Create a hash from object.
-
.parse_content(opts) {|data, row, header_map| ... } ⇒ Array<Hash>?
Parse row data matching a selector using a header map to translate between columns and friendly keys.
-
.parse_header_map(opts = {}) ⇒ Hash{Symbol,String => Integer}?
Parse header from selector and create a header map to match a column key with column index.
-
.parse_table(opts = {}) {|data, row, header_map| ... } ⇒ Hash{Symbol => Array,Hash,nil}
Parse data from a horizontal table-like structure matching the given selectors, using a header map to match columns.
-
.parse_vertical_table(opts = {}) {|data, row, header_map| ... } ⇒ Hash{Symbol => Array,Hash,nil}
Parse data from a vertical table-like structure matching the given selectors, using a header map to match columns.
-
.strip(raw_text, orig_encoding = 'ASCII') ⇒ String?
Strip a value by trimming spaces, reducing sequential spaces into a single space, decoding HTML entities and changing the encoding to UTF-8.
-
.translate_label_to_key(element, label_map) ⇒ Symbol, String
Extract column label and translate it into a friendly key.
Class Method Details
.decode_html(text) ⇒ String
Decode HTML entities from text.
33 34 35 |
# File 'lib/dh_easy/text.rb', line 33
# Decode HTML entities from text.
#
# @param text [String] Text that may contain HTML entities.
# @return [String] The text as returned by `CGI.unescapeHTML`.
def self.decode_html(text)
  CGI.unescapeHTML(text)
end
.default_parser(cell_element, data, key) ⇒ Object
Default cell content parser used to parse cell element.
62 63 64 65 66 |
# File 'lib/dh_easy/text.rb', line 62
# Default cell content parser used to parse a cell element.
#
# Removes `<i>` elements found inside the cell, then stores the stripped
# cell text into `data[key]`.
#
# @param cell_element [#search, #text, nil] Cell element (or node set) to parse.
# @param data [Hash] Row data hash that receives the parsed value.
# @param key [Symbol, String] Key under which the value is stored.
# @return [String, nil] The stored value, or `nil` when `cell_element` is nil
#   (in which case `data` is left untouched).
def self.default_parser(cell_element, data, key)
  return if cell_element.nil?

  # Use a relative XPath ('.//i'): an absolute '//i' is evaluated from the
  # document root and would delete every <i> in the whole document, not just
  # the ones inside this cell. Removing an empty result set is a no-op, so no
  # count check is needed.
  cell_element.search('.//i').remove
  data[key] = strip(cell_element.text)
end
.encode_html(text) ⇒ String
Encode text for valid HTML entities.
24 25 26 |
# File 'lib/dh_easy/text.rb', line 24
# Encode text for valid HTML entities.
#
# @param text [String] Raw text to escape.
# @return [String] The text as returned by `CGI.escapeHTML`.
def self.encode_html(text)
  CGI.escapeHTML(text)
end
.hash(object) ⇒ String
Create a hash from object
14 15 16 17 |
# File 'lib/dh_easy/text.rb', line 14
# Create a hash (SHA1 hex digest) from object.
#
# Non-Hash objects are digested from their `to_s` representation. Hash
# objects are first reduced to a canonical, key-sorted array of pairs so the
# digest is the same regardless of key insertion order and is reproducible
# between Ruby processes. (The previous implementation digested `Hash#hash`,
# whose value is seeded per process and therefore changed on every run.)
#
# @param object [Object] Object to create the hash from.
# @return [String] 40 character hexadecimal SHA1 digest.
def self.hash(object)
  # Recursively converts hashes into arrays of [key, value] pairs sorted by
  # the key's inspect output; arrays are walked so nested hashes are
  # normalized too. Anything else is returned untouched.
  canonical = lambda do |value|
    case value
    when Hash
      pairs = value.map { |k, v| [canonical.call(k), canonical.call(v)] }
      pairs.sort_by { |pair| pair.first.inspect }
    when Array
      value.map { |item| canonical.call(item) }
    else
      value
    end
  end

  # Only Hash input is normalized; every other input keeps its original
  # `to_s` based digest.
  object = canonical.call(object) if object.is_a?(Hash)
  Digest::SHA1.hexdigest(object.to_s)
end
.parse_content(opts) {|data, row, header_map| ... } ⇒ Array<Hash>?
Parse row data matching a selector using a header map to translate
between columns and friendly keys.
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# File 'lib/dh_easy/text.rb', line 89
# Parse row data matching a selector using a header map to translate
# between columns and friendly keys.
#
# @param opts [Hash] Configuration options.
# @option opts [#css, nil] :html (nil) Container searched with `:selector`.
# @option opts [String, nil] :selector (nil) Selector matching the rows.
# @option opts [Boolean] :first_row_header (false) Skip the first
#   (non ignored) matching row.
# @option opts [Hash{Symbol,String => Integer}] :header_map ({}) Column key
#   to column index map.
# @option opts [Hash] :column_parsers ({}) Callables keyed by column key and
#   invoked as `parser.call(cell_element, row_data, key)`; columns without
#   one use `default_parser`.
# @option opts [Boolean] :ignore_text_nodes (true) Ignore rows and cells
#   whose node name is 'text'.
#
# @yieldparam data [Hash] Parsed row data.
# @yieldparam row [Object] Row element the data came from.
# @yieldparam header_map [Hash] Header map in use.
# @yieldreturn [Boolean] Falsy to discard the row.
#
# @return [Array<Hash>, nil] Parsed rows, or `nil` when `:html` is nil
#   (matching the nil handling of `parse_table` and `parse_vertical_table`).
def self.parse_content(opts, &filter)
  opts = {
    html: nil,
    selector: nil,
    first_row_header: false,
    header_map: {},
    column_parsers: {},
    ignore_text_nodes: true
  }.merge(opts)

  # Without a container there is nothing to search; return nil instead of
  # failing with NoMethodError on nil.
  return nil if opts[:html].nil?

  header_map = opts[:header_map]
  column_parsers = opts[:column_parsers]
  ignore_text_nodes = opts[:ignore_text_nodes]
  skip_header = opts[:first_row_header]
  data = []

  opts[:html].css(opts[:selector]).each do |row|
    next if ignore_text_nodes && row.name == 'text'

    # The first usable row is the header when :first_row_header is set.
    if skip_header
      skip_header = false
      next
    end

    row_data = {}
    header_map.each do |key, index|
      # Children are re-read for every column, so a parser that mutates the
      # row affects the cells seen by the following columns.
      children = row.children
      children = children.reject { |child| child.name == 'text' } if ignore_text_nodes
      cell = children[index]

      parser = column_parsers[key]
      if parser.nil?
        default_parser(cell, row_data, key)
      else
        parser.call(cell, row_data, key)
      end
    end

    data << row_data if filter.nil? || filter.call(row_data, row, header_map)
  end

  data
end
.parse_header_map(opts = {}) ⇒ Hash{Symbol,String => Integer}?
Parse header from selector and create a header map to match a column key
with column index.
166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 |
# File 'lib/dh_easy/text.rb', line 166
# Parse header from selector and create a header map to match a column key
# with column index.
#
# @param opts [Hash] ({}) Configuration options.
# @option opts [#css, nil] :html (nil) Container searched with `:selector`.
# @option opts [String, nil] :selector (nil) Selector matching header rows.
# @option opts [Hash] :column_key_label_map ({}) Key to label (String or
#   Regexp) dictionary handed to `translate_label_to_key`.
# @option opts [Boolean] :first_row_header (false) Only consider the first
#   matching row.
# @option opts [Boolean] :ignore_text_nodes (true) Ignore rows and columns
#   whose node name is 'text'.
#
# @return [Hash{Symbol,String => Integer}, nil] Header map built from the
#   first usable row, or `nil` when the rows cannot be selected or no usable
#   row exists.
def self.parse_header_map(opts = {})
  opts = {
    html: nil,
    selector: nil,
    column_key_label_map: {},
    first_row_header: false,
    ignore_text_nodes: true
  }.merge(opts)

  dictionary = opts[:column_key_label_map]
  ignore_text_nodes = opts[:ignore_text_nodes]

  # Any failure while selecting rows (nil container, bad selector, ...) is
  # reported as "no header map", as the original `rescue nil` did.
  html_rows =
    begin
      opts[:html].css(opts[:selector])
    rescue StandardError
      nil
    end
  return nil if html_rows.nil?

  # `compact` guards the empty selection case: `first` is then nil and used
  # to crash on `row.name`.
  html_rows = [html_rows.first].compact if opts[:first_row_header]

  html_rows.each do |row|
    next if ignore_text_nodes && row.name == 'text'

    column_map = {}
    children = row.children
    children = children.reject { |child| child.name == 'text' } if ignore_text_nodes
    children.each_with_index do |col, index|
      column_key = translate_label_to_key(col, dictionary)
      next if column_key.nil?

      column_map[column_key] = index
    end

    # Only the first usable row defines the header map, so stop here instead
    # of building maps for rows whose result would be discarded.
    return column_map
  end

  nil
end
.parse_table(opts = {}) {|data, row, header_map| ... } ⇒ Hash{Symbol => Array,Hash,nil}
Parse data from a horizontal table-like structure matching the given selectors,
using a header map to match columns.
226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 |
# File 'lib/dh_easy/text.rb', line 226
# Parse data from a horizontal table-like structure: the header map is built
# with `parse_header_map` and the rows are read with `parse_content`.
#
# @param opts [Hash] ({}) Configuration options.
# @option opts [#css, nil] :html (nil) Container to search.
# @option opts [String, nil] :header_selector (nil) Header row selector.
# @option opts [Hash] :header_key_label_map ({}) Key to label dictionary.
# @option opts [String, nil] :content_selector (nil) Content row selector.
# @option opts [Boolean] :first_row_header (false) Forwarded to both helpers.
# @option opts [Hash] :column_parsers ({}) Custom parsers by column key.
# @option opts [Boolean] :ignore_text_nodes (true) Forwarded to both helpers.
#
# @yield [data, row, header_map] Row filter forwarded to `parse_content`.
#
# @return [Hash{Symbol => Array,Hash,nil}, nil] `{header_map:, data:}`, or
#   `nil` when `:html` is nil or no header map could be built.
def self.parse_table(opts = {}, &filter)
  defaults = {
    html: nil,
    header_selector: nil,
    header_key_label_map: {},
    content_selector: nil,
    first_row_header: false,
    column_parsers: {},
    ignore_text_nodes: true
  }
  opts = defaults.merge(opts)

  html = opts[:html]
  return nil if html.nil?

  header_first = opts[:first_row_header]
  skip_text = opts[:ignore_text_nodes]

  header_map = parse_header_map(
    html: html,
    selector: opts[:header_selector],
    column_key_label_map: opts[:header_key_label_map],
    first_row_header: header_first,
    ignore_text_nodes: skip_text
  )
  return nil if header_map.nil?

  rows = parse_content(
    html: html,
    selector: opts[:content_selector],
    header_map: header_map,
    first_row_header: header_first,
    column_parsers: opts[:column_parsers],
    ignore_text_nodes: skip_text,
    &filter
  )

  { header_map: header_map, data: rows }
end
.parse_vertical_table(opts = {}) {|data, row, header_map| ... } ⇒ Hash{Symbol => Array,Hash,nil}
Parse data from a vertical table-like structure matching the given selectors,
using a header map to match columns.
276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 |
# File 'lib/dh_easy/text.rb', line 276 def self.parse_vertical_table opts = {}, &filter opts = { html: nil, row_selector: nil, header_selector: nil, header_key_label_map: {}, content_selector: nil, column_parsers: {}, ignore_text_nodes: true }.merge opts return nil if opts[:html].nil? # Setup config data = {} dictionary = opts[:header_key_label_map] column_parsers = opts[:column_parsers] # Extract headers and content html_rows = opts[:html].css(opts[:row_selector]) rescue nil return nil if html_rows.nil? html_rows.each do |row| # Parse and map column header header_element = row.css(opts[:header_selector]) key = translate_label_to_key header_element, dictionary next if key.nil? || key == '' # Parse column html with default or custom parser content_element = row.css(opts[:content_selector]) column_parsers[key].nil? ? default_parser(content_element, data, key) : column_parsers[key].call(content_element, data, key) end data end |
.strip(raw_text, orig_encoding = 'ASCII') ⇒ String?
Strip a value by trimming spaces, reducing sequential spaces into a
single space, decoding HTML entities and changing the encoding to UTF-8.
44 45 46 47 48 49 50 51 52 53 54 55 |
# File 'lib/dh_easy/text.rb', line 44
# Strip a value by trimming spaces, reducing sequential spaces into a
# single space, decoding HTML entities and changing the encoding to UTF-8.
#
# @param raw_text [String, Object, nil] Value to strip; non-String values
#   are converted with `to_s`.
# @param orig_encoding [String] ('ASCII') Encoding assumed for `raw_text`
#   when it cannot be matched against a UTF-8 regular expression.
#
# @return [String, nil] Cleaned text, or `nil` when `raw_text` is nil.
def self.strip(raw_text, orig_encoding = 'ASCII')
  return nil if raw_text.nil?

  raw_text = raw_text.to_s unless raw_text.is_a?(String)
  regex = /(\s|\u3000|\u00a0)+/

  # Probe the text with a UTF-8 regexp: a raise means the text has an
  # incompatible or broken encoding and must be transcoded first.
  good_encoding =
    begin
      raw_text =~ /\u3000/
      true
    rescue StandardError
      false
    end

  unless good_encoding
    # `force_encoding` works in place, so operate on a copy: the caller's
    # string must not be re-tagged, and frozen input must not raise.
    raw_text = raw_text.dup
                       .force_encoding(orig_encoding)
                       .encode('UTF-8', invalid: :replace, undef: :replace)
    regex = /(\s|\u3000|\u00a0|\u00c2\u00a0)+/
  end

  decode_html(raw_text.gsub(regex, ' ').strip)
end
.translate_label_to_key(element, label_map) ⇒ Symbol, String
Extract column label and translate it into a friendly key.
142 143 144 145 146 147 148 149 150 |
# File 'lib/dh_easy/text.rb', line 142
# Extract column label and translate it into a friendly key.
#
# Removes `<i>` elements found inside the element, strips its text and
# returns the first key of `label_map` whose label matches that text.
#
# @param element [#search, #text, nil] Element (or node set) holding the label.
# @param label_map [Hash{Symbol,String => String,Regexp}] Key to label
#   dictionary; Regexp labels are matched with `=~`, others with `==`.
#
# @return [Symbol, String, nil] Matching key, or `nil` when `element` is nil
#   or no label matches.
def self.translate_label_to_key(element, label_map)
  return nil if element.nil?

  # Use a relative XPath ('.//i'): an absolute '//i' is evaluated from the
  # document root and would delete every <i> in the whole document, not just
  # the ones inside this element. Removing an empty result set is a no-op.
  element.search('.//i').remove
  text = strip(element.text)

  key, _label = label_map.find do |_key, label|
    label.is_a?(Regexp) ? (text =~ label) : (text == label)
  end
  key
end