Class: Htmlarticle
- Inherits:
-
Object
- Object
- Htmlarticle
- Defined in:
- lib/models/htmlarticle.rb
Class Method Summary collapse
- .get_desp(n, desp_buff, html_replacer, html_replacer_for_no_tag_line) ⇒ Object
-
.get_html_content(params) ⇒ Object
参数说明：doc 源代码（必填参数）；content_selector 正文规则（必填参数）；content_replacer 正文替换正则；content_filter 正文过滤；content_rid_html_selector 正文剔除html标签；html_replacer html换行标签；html_replacer_for_no_tag_line 无标签文字是否按照同级换行标签换行（0 不处理，1 换行处理）。 params = {doc:doc,content_selector:content_selector,content_rid_html_selector:content_rid_html_selector,html_replacer:html_replacer,html_replacer_for_no_tag_line:html_replacer_for_no_tag_line,content_replacer:content_replacer,content_filter:content_filter} 示例用法： doc = Nokogiri::HTML(res.body) content_selector = "div.content" html_replacer = "p" params = {doc:doc,content_selector:content_selector,html_replacer:html_replacer} desp,html_content = Htmlarticle.get_html_content(params).
Instance Method Summary collapse
- #content ⇒ Object
-
#initialize(text, options = {}) ⇒ Htmlarticle
constructor
A new instance of Htmlarticle.
- #parse ⇒ Object
- #title ⇒ Object
Constructor Details
#initialize(text, options = {}) ⇒ Htmlarticle
Returns a new instance of Htmlarticle.
3 4 5 6 7 |
# File 'lib/models/htmlarticle.rb', line 3
#
# Builds an extractor around a raw HTML string.
#
# @param text [String] raw HTML source; stored in @text
# @param options [Hash] stored as-is in @options
# @return [Htmlarticle]
def initialize(text, options = {})
  @text = text
  @options = options
  # Starts empty so the instance always holds a String here.
  @content = ""
end
Class Method Details
.get_desp(n, desp_buff, html_replacer, html_replacer_for_no_tag_line) ⇒ Object
93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
# File 'lib/models/htmlarticle.rb', line 93
#
# Recursively flattens node +n+ and all of its descendants into plain text,
# appending to +desp_buff+ and returning the extended string. The string
# passed in is never mutated; a new one is returned.
#
# Line-break rules:
# * When break tags are configured (+html_replacer+ has a non-blank first
#   entry), a "\n" is emitted before every node whose name is listed.
# * When no break tags are configured, a "\n" is emitted after every text
#   node instead.
# * When +html_replacer_for_no_tag_line+ is 1, a text node that directly
#   follows a sibling whose name is a break tag also gets a leading "\n".
#
# Newlines inside a text node are replaced by spaces.
#
# @param n [#name, #inner_text, #children, #parent, #previous_sibling]
#   node to flatten (presumably a Nokogiri node — confirm against callers)
# @param desp_buff [String] text accumulated so far
# @param html_replacer [Array<String>] tag names treated as line breaks
# @param html_replacer_for_no_tag_line [Integer] 1 to break bare text that
#   follows a break tag; any other value leaves it alone
# @return [String] the accumulated text
def self.get_desp(n, desp_buff, html_replacer, html_replacer_for_no_tag_line)
  has_break_tags = html_replacer.count > 0 && html_replacer[0].present?

  desp_buff += "\n" if has_break_tags && html_replacer.include?(n.name)

  if n.name == "text"
    if has_break_tags && html_replacer_for_no_tag_line == 1
      # NOTE(review): for a text node this comparison looks always true
      # (first_element_child presumably never returns a text node) — confirm
      # before simplifying; kept to preserve behaviour.
      if n.parent.first_element_child != n
        previous = n.previous_sibling
        previous_name = previous && previous.name
        desp_buff += "\n" if html_replacer.include?(previous_name)
      end
    end

    text = n.inner_text
    desp_buff += text.gsub("\n", " ") if text.present?
    desp_buff += "\n" unless has_break_tags
  end

  children = n.children
  if children.present?
    children.each do |child|
      desp_buff = get_desp(child, desp_buff, html_replacer, html_replacer_for_no_tag_line)
    end
  end

  desp_buff
end
.get_html_content(params) ⇒ Object
参数说明：doc 源代码（必填参数）；content_selector 正文规则（必填参数）；content_replacer 正文替换正则；content_filter 正文过滤；content_rid_html_selector 正文剔除html标签；html_replacer html换行标签；html_replacer_for_no_tag_line 无标签文字是否按照同级换行标签换行（0 不处理，1 换行处理）。 params = {doc:doc,content_selector:content_selector,content_rid_html_selector:content_rid_html_selector,html_replacer:html_replacer,html_replacer_for_no_tag_line:html_replacer_for_no_tag_line,content_replacer:content_replacer,content_filter:content_filter} 示例用法： doc = Nokogiri::HTML(res.body) content_selector = "div.content" html_replacer = "p" params = {doc:doc,content_selector:content_selector,html_replacer:html_replacer} desp,html_content = Htmlarticle.get_html_content(params)
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/models/htmlarticle.rb', line 28
#
# Extracts the article HTML and its plain-text rendering from a parsed
# document, trying each content selector in order until one yields both.
#
# Every list-valued option is a single String whose entries are separated
# by "||||".
#
# @param params [Hash] options:
#   * :doc (required) parsed document; must respond to #clone and #search
#     (e.g. the result of Nokogiri::HTML(res.body))
#   * :content_selector (required) selector(s) locating the article body
#   * :content_rid_html_selector selector(s) of nodes removed from each match
#   * :html_replacer tag name(s) rendered as line breaks in the plain text
#   * :html_replacer_for_no_tag_line 1 to also break bare text that follows
#     a break tag, 0 to leave it alone
#   * :content_filter literal string(s); if the plain text contains any of
#     them, both results are blanked
#   * :content_replacer entries of the form "text" or "text&&&&html"; the
#     first part is deleted from the plain text and the optional second part
#     from the HTML. Patterns are matched as literal strings, not regexes.
# @return [Array(String, String)] [desp, html_content]: the plain text with
#   blank lines dropped and every line stripped, and the matched HTML
#
# @example
#   doc = Nokogiri::HTML(res.body)
#   params = { doc: doc, content_selector: "div.content", html_replacer: "p" }
#   desp, html_content = Htmlarticle.get_html_content(params)
def self.get_html_content(params)
  desp_buff = ""
  html_content = ""
  doc = params[:doc]
  content_selector = params[:content_selector].to_s.split("||||")
  html_replacer = params[:html_replacer].to_s.split("||||")
  html_replacer_for_no_tag_line = params[:html_replacer_for_no_tag_line]
  content_rid_html_selector = params[:content_rid_html_selector].to_s.split("||||")
  has_break_tags = html_replacer.count > 0 && html_replacer[0].present?

  content_selector.each do |selector|
    # Work on a copy so removing unwanted nodes never touches the caller's doc.
    doc_content = doc.clone
    html_content = ""

    doc_content.search(selector).each do |matched|
      # Strip the nodes the caller does not want before serialising.
      content_rid_html_selector.each { |rid| matched.search(rid).remove }
      html_content += matched.to_s if matched.present?
    end

    # Second pass builds the plain text; the search is repeated on purpose so
    # it reflects every removal done above.
    doc_content.search(selector).each do |matched|
      desp_buff += "\n" if has_break_tags && html_replacer.include?(matched.name)
      matched.children.each do |child|
        desp_buff = get_desp(child, desp_buff, html_replacer, html_replacer_for_no_tag_line)
      end
    end

    break if html_content.present? && desp_buff.present?
  end

  # Empty entries (from adjacent separators) are skipped: "".include?("") is
  # always true and would otherwise blank every result.
  filters = params[:content_filter].to_s.split("||||").reject(&:empty?)
  if filters.any? { |filter| desp_buff.include?(filter) }
    desp_buff = ""
    html_content = ""
  end

  params[:content_replacer].to_s.split("||||").each do |replacer|
    next unless replacer.present?

    desp_buff = desp_buff.gsub(replacer, "")
    desp_pattern, html_pattern = replacer.split("&&&&")
    # desp_pattern is nil when the entry is only "&&&&"; gsub(nil, ...) raises.
    desp_buff = desp_buff.gsub(desp_pattern, "") if desp_pattern
    html_content = html_content.gsub(html_pattern, "") if html_pattern.present?
  end

  # Normalise whitespace: strip every line and drop the blank ones.
  desp = desp_buff.split("\n").map(&:strip).select(&:present?).map { |line| line + "\n" }.join

  [desp, html_content]
end
Instance Method Details
#content ⇒ Object
9 10 11 |
# File 'lib/models/htmlarticle.rb', line 9
#
# Reader for @content.
#
# @return [Object] the current value of @content
def content
  @content
end
#parse ⇒ Object
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 |
# File 'lib/models/htmlarticle.rb', line 127
#
# Extracts the main article text from the HTML held in @text using a
# line-density heuristic, stores it in @content and returns it.
#
# Steps:
# 1. Remove HTML comments, <script> and <style> blocks; put a newline after
#    every </a>. If the result has fewer than 10 lines, break after every ">".
# 2. Keep only the <body>...</body> section, turn </p> and <br .../> into
#    "[crlf]" markers, delete the remaining tags, split into stripped lines.
# 3. Slide a window of +depth+ lines over the result. The article starts once
#    the previous window held more than +limit_count+ characters and the
#    current one is not empty. From there the scan walks backwards to the
#    nearest run of +head_empty_lines+ blank lines and starts just after it,
#    or at the current line if no such run exists.
# 4. While inside the article every line is appended. When the current window
#    has at most +end_limit_char_count+ characters and the previous one had
#    fewer, the article is considered finished and the search for another
#    dense region resumes.
# 5. "[crlf]" markers become "\n".
#
# Lines are concatenated without separators; only "[crlf]" markers produce
# line breaks in the result.
#
# @return [String] the extracted text ("" when nothing qualifies)
def parse
  depth = 6                  # lines summed per window
  limit_count = 180          # previous-window size that signals the article start
  head_empty_lines = 2       # consecutive blank lines that delimit the article head
  end_limit_char_count = 20  # window size at or below which the article ends

  text = @text.to_s
              .gsub(/(?imx)<!--.*?-->/, "")
              .gsub(/(?imx)<script.+?script>/, "")
              .gsub(/(?imx)<style.+?style>/, "")
              .gsub(/<\/a>/, "</a>\n")
  # Markup delivered on very few lines is re-split so the heuristic has
  # something to measure.
  text = text.gsub(">", ">\n") if text.split("\n").count < 10

  body = text.match(/(?imx)<body.+?<\/body>/).to_s
  lines = body.gsub(/(?imx)<\/p>|<br.+?\/>/, "[crlf]")
              .gsub(/(?imx)<(\S*?)[^>]*>.*?|<.*? \/>/, "")
              .split("\n")
              .map(&:strip)

  @content = ""
  pre_text_len = 0
  start_pos = -1 # -1 means "not currently inside an article"

  0.upto(lines.count - depth - 1) do |i|
    len = lines[i, depth].inject(0) { |sum, line| sum + line.size }

    if start_pos == -1
      if pre_text_len > limit_count && len > 0
        empty_count = 0
        (i - 1).downto(1) do |z|
          empty_count = lines[z].to_s == "" ? empty_count + 1 : 0
          # Checked on every step: a check placed in the non-empty branch,
          # right after the reset to 0, could never fire.
          if empty_count == head_empty_lines
            start_pos = z + head_empty_lines
            break
          end
        end
        start_pos = i if start_pos == -1
        @content += lines[start_pos..i].join
      end
    else
      # Leaving the article does not stop the scan; a later dense region
      # is appended as well.
      start_pos = -1 if len <= end_limit_char_count && pre_text_len < end_limit_char_count
      @content += lines[i]
    end

    pre_text_len = len
  end

  @content = @content.gsub("[crlf]", "\n")
  @content
end
#title ⇒ Object
123 124 125 |
# File 'lib/models/htmlarticle.rb', line 123
#
# Placeholder with an empty body.
#
# @return [nil] always nil; the body is empty
def title
end