Module: Awestruct::ContextHelper
- Defined in:
- lib/awestruct/context_helper.rb
Instance Method Summary
- #clean_html(str) ⇒ Object
- #close_tags(s) ⇒ Object
- #fix_url(base_url, url) ⇒ Object
- #fully_qualify_urls(base_url, text) ⇒ Object
- #html_to_text(str) ⇒ Object
- #summarize(text, numwords = 20, ellipsis = '...') ⇒ Object
- #without_images(str) ⇒ Object
Instance Method Details
#clean_html(str) ⇒ Object
11 12 13 |
# File 'lib/awestruct/context_helper.rb', line 11
#
# Replaces every `&nbsp;` HTML entity in +str+ with a plain space.
#
# @param str [String] HTML or text that may contain `&nbsp;` entities
# @return [String] a new string; +str+ itself is not modified
def clean_html(str)
  str.gsub('&nbsp;', ' ')
end
#close_tags(s) ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/awestruct/context_helper.rb', line 19
#
# HTML elements that never take a closing tag; they are ignored when
# tracking which elements are still open.
HTML_VOID_TAGS = %w[
  area base br col embed hr img input link meta param source track wbr
].freeze

# Appends the closing tags that are missing from an HTML fragment.
#
# Tags are read left to right and open elements are kept on a stack. Each
# closing tag must match the most recently opened element. Whatever is still
# open at the end is closed, innermost first.
#
# Void elements (such as +br+ or +img+), self-closing tags (+<x/>+),
# comments, doctypes and processing instructions are skipped.
#
# @param s [String] the HTML fragment
# @return [String] +s+ followed by any closing tags that were missing
# @raise [RuntimeError] if a closing tag does not match the innermost open
#   element
def close_tags(s)
  open_tags = []

  s.scan(/<\/?[^>]+>/).each do |tag|
    # Comments, doctypes and processing instructions are not elements.
    next if tag.start_with?('<!', '<?')

    closing = tag[1] == '/'
    name = tag[(closing ? 2 : 1)..-1][/\w+/]

    # Void elements are never left open, and a stray closing tag for one
    # (e.g. "<br></br>") is tolerated rather than treated as malformed.
    next if name && HTML_VOID_TAGS.include?(name.downcase)

    if closing
      unless open_tags.last == name
        raise "Malformed HTML expected #{open_tags.last} but got #{name} '#{s}'"
      end
      open_tags.pop
    else
      # A nameless tag or a self-closing tag opens nothing.
      next if name.nil? || tag.end_with?('/>')
      open_tags.push(name)
    end
  end

  s + open_tags.reverse.map { |open_name| "</#{open_name}>" }.join
end
#fix_url(base_url, url) ⇒ Object
70 71 72 73 |
# File 'lib/awestruct/context_helper.rb', line 70
#
# Prefixes a site-root-relative URL with +base_url+.
#
# Only URLs that begin with a single "/" are rewritten. Everything else is
# returned untouched: nil, relative paths, URLs with a scheme, and
# protocol-relative URLs ("//host/path"), which already name their host.
#
# @param base_url [String] prefix to prepend, e.g. "http://example.org"
# @param url [String, nil] the URL to qualify
# @return [String, nil] the qualified URL, or +url+ unchanged
def fix_url(base_url, url)
  return url if url.nil?
  # start_with? checks the real start of the string; the regex anchor "^"
  # would also match after an embedded newline.
  return url unless url.start_with?('/')
  return url if url.start_with?('//')

  "#{base_url}#{url}"
end
#fully_qualify_urls(base_url, text) ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/awestruct/context_helper.rb', line 41
#
# Parses +text+ as XML with Oga and passes the +href+ of every +a+ and
# +link+ element, and the +src+ of every +img+ element, through #fix_url
# with +base_url+. Attributes that are absent are left alone.
#
# If anything raises a StandardError, the error is logged and the original
# +text+ is returned instead.
#
# @param base_url [String] prefix handed to #fix_url
# @param text [String] the markup to process
# @return [String] the re-serialized markup, forced to +text+'s encoding
#   when the two differ, or +text+ itself on failure
def fully_qualify_urls(base_url, text)
  doc = Oga.parse_xml text

  doc.each_node do |node|
    next unless node.is_a?(Oga::XML::Element)

    attr_name = case node.name
                when 'a', 'link' then 'href'
                when 'img' then 'src'
                end
    next unless attr_name

    current = node.get(attr_name)
    node.set attr_name, fix_url(base_url, current) if current
  end

  xml = doc.to_xml
  xml.force_encoding(text.encoding) if xml.encoding != text.encoding
  xml
rescue => e
  Awestruct::ExceptionHelper.log_error e
  if $LOG.info?
    $LOG.info %Q(If the error has to do with 'end of input' ensure none of the following tags have a closing tag: #{Oga::XML::HTML_VOID_ELEMENTS.to_a.collect {|a| a.downcase}.uniq.join(', ')})
  end
  $LOG.warn "Text being parsed:\n#{text}" if $LOG.warn?
  # Hand back the unparsed input so the offending markup can be found.
  text
end
#html_to_text(str) ⇒ Object
7 8 9 |
# File 'lib/awestruct/context_helper.rb', line 7
#
# Reduces HTML to text: removes everything that looks like a tag
# ("<" through the next ">") and replaces each `&nbsp;` entity with a
# plain space. Other entities are left as they are.
#
# @param str [String] the HTML to strip
# @return [String] a new string without tags; +str+ is not modified
def html_to_text(str)
  without_tags = str.gsub(/<[^>]+>/, '')
  without_tags.gsub('&nbsp;', ' ')
end
#summarize(text, numwords = 20, ellipsis = '...') ⇒ Object
37 38 39 |
# File 'lib/awestruct/context_helper.rb', line 37
#
# Shortens +text+ to at most +numwords+ words.
#
# Words are the pieces obtained by splitting on single space characters.
# +ellipsis+ is appended only when words were actually cut off; text that
# already fits is returned without it.
#
# @param text [String] the text to shorten
# @param numwords [Integer] maximum number of words to keep; negative
#   values are treated as 0
# @param ellipsis [String] marker appended to truncated text
# @return [String] the shortened text
def summarize(text, numwords = 20, ellipsis = '...')
  words = text.split(/ /)
  limit = [numwords, 0].max
  # Nothing is cut off, so there is nothing for the ellipsis to stand for.
  return words.join(' ') if words.length <= limit

  words.first(limit).join(' ') + ellipsis
end
#without_images(str) ⇒ Object
15 16 17 |
# File 'lib/awestruct/context_helper.rb', line 15
#
# Removes every `<img ...>` tag from +str+, then replaces each
# `<a ...>text</a>` whose content contains no "<" with just its text.
# Anchors that wrap other markup are left in place.
#
# @param str [String] the HTML to process
# @return [String] a new string; +str+ is not modified
def without_images(str)
  image_free = str.gsub(/<img[^>]+>/, '')
  # Keep only the captured link text.
  image_free.gsub(/<a[^>]+>([^<]*)<\/a>/) { Regexp.last_match(1) }
end