Class: Htmlarticle

Inherits:
Object
  • Object
show all
Defined in:
lib/models/htmlarticle.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(text, options = {}) ⇒ Htmlarticle

Returns a new instance of Htmlarticle.



3
4
5
6
7
# File 'lib/models/htmlarticle.rb', line 3

# Builds an extractor around a piece of raw HTML.
#
# @param text [Object] the HTML to analyse; stored untouched in @text
#   (#parse converts it with to_s before use)
# @param options [Hash] stored in @options
def initialize(text, options = {})
  @text, @options = text, options
  # Extracted text; stays empty until #parse fills it in.
  @content = ""
end

Class Method Details

.get_desp(n, desp_buff, html_replacer, html_replacer_for_no_tag_line) ⇒ Object



93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# File 'lib/models/htmlarticle.rb', line 93

# Recursively flattens the subtree rooted at +n+ into plain text and returns
# +desp_buff+ with that text appended (the caller's string is not mutated).
#
# @param n [#name, #children, #inner_text] node to walk
#   (presumably a Nokogiri node — confirm against callers)
# @param desp_buff [String] text accumulated so far
# @param html_replacer [Array<String>] tag names that start a new line; the
#   list counts as "not configured" when it is empty or its first entry is blank
# @param html_replacer_for_no_tag_line [Integer] when 1, a text node whose
#   previous sibling is one of the line-break tags is also put on a new line
# @return [String] the extended buffer
def self.get_desp(n, desp_buff, html_replacer, html_replacer_for_no_tag_line)
  break_tags_given = html_replacer.count > 0 && html_replacer[0].present?

  # A configured line-break tag opens a new line before its content.
  desp_buff += "\n" if break_tags_given && html_replacer.include?(n.name)

  if n.name == "text"
    break_before_text = html_replacer_for_no_tag_line == 1 &&
                        break_tags_given &&
                        n.parent.first_element_child != n &&
                        html_replacer.include?(n.previous_sibling.try(:name))
    desp_buff += "\n" if break_before_text

    text = n.inner_text
    desp_buff += text.gsub("\n", " ") if text.present?

    # Without configured break tags every text node ends its own line.
    desp_buff += "\n" unless break_tags_given
  end

  children = n.children
  return desp_buff unless children.present?

  children.reduce(desp_buff) do |buffer, child|
    get_desp(child, buffer, html_replacer, html_replacer_for_no_tag_line)
  end
end

.get_html_content(params) ⇒ Object

参数说明:
  doc                           源代码(必填参数)
  content_selector              正文规则(必填参数)
  content_replacer              正文替换正则
  content_filter                正文过滤
  content_rid_html_selector     正文剔除html标签
  html_replacer                 html换行标签
  html_replacer_for_no_tag_line 无标签文字是否按照同级换行标签换行:0 不处理,1 换行处理

params = {doc:doc,content_selector:content_selector,content_rid_html_selector:content_rid_html_selector,html_replacer:html_replacer,html_replacer_for_no_tag_line:html_replacer_for_no_tag_line,content_replacer:content_replacer,content_filter:content_filter}

示例用法:
  doc = Nokogiri::HTML(res.body)
  content_selector = "div.content"
  html_replacer = "p"
  params = {doc:doc,content_selector:content_selector,html_replacer:html_replacer}
  desp,html_content = Htmlarticle.get_html_content(params)



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/models/htmlarticle.rb', line 28

# Extracts the article body from a parsed document, both as plain text and as
# HTML. Multi-valued options are single strings whose entries are separated
# by "||||".
#
# @param params [Hash] options:
#   * :doc (required) parsed document responding to clone and search,
#     e.g. a Nokogiri::HTML document
#   * :content_selector (required) selector(s) for the body; they are tried in
#     order until one yields both HTML and text
#   * :content_rid_html_selector selector(s) of nodes removed from the body
#   * :html_replacer tag name(s) that start a new line in the text
#   * :html_replacer_for_no_tag_line 0 = leave untagged text alone, 1 = also
#     break before untagged text that follows a line-break tag
#   * :content_filter string(s); if the text contains any of them, both
#     results are emptied
#   * :content_replacer entries "text_part&&&&html_part": text_part is deleted
#     from the text, html_part from the HTML (literal string match)
# @return [Array(String, String)] the text (one stripped, non-blank line per
#   "\n"-terminated row) and the HTML of the matched nodes
def self.get_html_content(params)
  separator = "||||"
  desp_buff = ""
  html_content = ""
  doc = params[:doc]
  selectors = params[:content_selector].to_s.split(separator)
  html_replacer = params[:html_replacer].to_s.split(separator)
  html_replacer_for_no_tag_line = params[:html_replacer_for_no_tag_line]
  rid_selectors = params[:content_rid_html_selector].to_s.split(separator)
  break_tags_given = html_replacer.count > 0 && html_replacer[0].present?

  selectors.each do |selector|
    doc_content = doc.clone
    html_content = ""

    # First pass: drop unwanted nodes and collect the HTML.
    doc_content.search(selector).each do |node|
      rid_selectors.each { |rid| node.search(rid).remove }
      html_content += node.to_s if node.present?
    end

    # Second pass (on the already cleaned tree): collect the plain text.
    doc_content.search(selector).each do |node|
      desp_buff += "\n" if break_tags_given && html_replacer.include?(node.name)
      node.children.each do |child|
        desp_buff = get_desp(child, desp_buff, html_replacer, html_replacer_for_no_tag_line)
      end
    end

    break if html_content.present? && desp_buff.present?
  end

  # Empty tokens (from a leading or doubled separator) are skipped: every
  # string includes "", so they would otherwise wipe every result.
  filters = params[:content_filter].to_s.split(separator).reject(&:empty?)
  if filters.any? { |filter| desp_buff.include?(filter) }
    desp_buff = ""
    html_content = ""
  end

  params[:content_replacer].to_s.split(separator).each do |replacer|
    next unless replacer.present?

    # The whole entry is removed first, then its text part on its own.
    desp_buff = desp_buff.gsub(replacer, "")
    desp_part, html_part = replacer.split("&&&&")
    # An entry made only of "&&&&" splits to nothing; gsub(nil, ...) would raise.
    desp_buff = desp_buff.gsub(desp_part, "") unless desp_part.nil?
    html_content = html_content.gsub(html_part, "") if html_part.present?
  end

  desp = desp_buff.split("\n")
                  .map(&:strip)
                  .select(&:present?)
                  .map { |line| line + "\n" }
                  .join

  [desp, html_content]
end

Instance Method Details

#content ⇒ Object



9
10
11
# File 'lib/models/htmlarticle.rb', line 9

# Returns the text held in @content: an empty string right after
# construction, or the result of the most recent #parse call.
def content
  @content
end

#parse ⇒ Object



127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
# File 'lib/models/htmlarticle.rb', line 127

# Extracts the main text of the HTML held in @text with a line-density
# heuristic, stores it in @content and returns it.
#
# Comments, <script> and <style> sections are removed, the <body> section is
# split into lines, </p> and <br .../> become "[crlf]" markers (turned into
# "\n" at the end) and all remaining tags are stripped. A window of +depth+
# consecutive lines then slides over the stripped lines:
# * the text starts once the previous window held more than +limit_count+
#   characters and the current one is not empty;
# * it ends once the current window holds at most, and the previous window
#   fewer than, +end_limit_char_count+ characters. Scanning then continues,
#   so several text blocks can be collected.
#
# @return [String] the extracted text ("" when nothing qualifies)
def parse
  depth = 6                  # lines per window
  limit_count = 180          # window size (chars) that marks the start of text
  head_empty_lines = 2       # consecutive empty lines that delimit the start
  end_limit_char_count = 20  # window size (chars) that marks the end of text

  text = @text.to_s
              .gsub(/(?imx)<!--.*?-->/, "")
              .gsub(/(?imx)<script.+?script>/, "")
              .gsub(/(?imx)<style.+?style>/, "")
              .gsub(/<\/a>/, "</a>\n")

  # Markup delivered on very few lines is split after every tag so that the
  # line-based heuristic has something to work with.
  text = text.gsub(">", ">\n") if text.split("\n").count < 10

  body = text.match(/(?imx)<body.+?<\/body>/).to_s
  lines = body.gsub(/(?imx)<\/p>|<br.+?\/>/, "[crlf]")
              .gsub(/(?imx)<(\S*?)[^>]*>.*?|<.*? \/>/, "")
              .split("\n")
              .map(&:strip)

  @content = ""
  pre_text_len = 0
  start_pos = -1

  (0..(lines.count - depth - 1)).each do |i|
    len = lines[i, depth].sum(&:size)

    if start_pos == -1
      if pre_text_len > limit_count && len > 0
        # Walk back to the line that follows the nearest run of
        # +head_empty_lines+ empty lines. The counter is tested after every
        # line: testing it only in the non-empty branch, right after the
        # reset to 0, could never succeed.
        empty_count = 0
        (i - 1).downto(1) do |z|
          empty_count = lines[z].empty? ? empty_count + 1 : 0
          next unless empty_count == head_empty_lines

          start_pos = z + head_empty_lines
          break
        end
        start_pos = i if start_pos == -1
        @content += lines[start_pos..i].join
      end
    else
      # The closing line is still appended before scanning resumes.
      start_pos = -1 if len <= end_limit_char_count && pre_text_len < end_limit_char_count
      @content += lines[i]
    end

    pre_text_len = len
  end

  @content = @content.gsub("[crlf]", "\n")
end

#title ⇒ Object



123
124
125
# File 'lib/models/htmlarticle.rb', line 123

# Placeholder with an empty body: always returns nil.
# NOTE(review): title extraction looks unimplemented — confirm whether callers rely on nil.
def title

end