Class: TaiwaneseNewsParser::Parser::ChinaTimes
Instance Attribute Summary
#article, #url
Class Method Summary
collapse
Instance Method Summary
collapse
applicable_parser, #clean_up, #initialize, #reproduced?, subclasses
Class Method Details
.applicable?(url) ⇒ Boolean
10
11
12
|
# File 'lib/taiwanese_news_parser/parser/china_times.rb', line 10
# Tells whether this parser should handle +url+.
#
# @param url [String] address of the article
# @return [Boolean] true when +url+ contains 'chinatimes.com' but does not
#   contain 'money.chinatimes.com'
def self.applicable?(url)
  # The money sub-site is excluded first; its host also contains the
  # main domain string, so the order of the checks does not matter.
  return false if url.include?('money.chinatimes.com')

  url.include?('chinatimes.com')
end
|
.domain ⇒ Object
2
3
4
|
# File 'lib/taiwanese_news_parser/parser/china_times.rb', line 2
# Domain string associated with this parser.
#
# @return [String] always 'chinatimes.com'
def self.domain
  'chinatimes.com'
end
|
.names ⇒ Object
6
7
8
|
# File 'lib/taiwanese_news_parser/parser/china_times.rb', line 6
# Names associated with this parser (presumably the publications and brands
# it covers; how callers use the list is not visible here).
#
# @return [Array<String>] a freshly built array on every call
def self.names
  [
    '中國時報',
    '中時電子報',
    '工商時報',
    '旺報',
    '時報週刊',
    '中天',
    '中視',
    '中廣',
    '中時即時'
  ]
end
|
.parse_url_id(url) ⇒ Object
74
75
76
77
78
79
80
81
82
83
|
# File 'lib/taiwanese_news_parser/parser/china_times.rb', line 74
# Extracts an identifier for the article from its URL.
#
# Three patterns are tried in order and the first capture found wins:
# 1. "http://news.chinatimes.com/<section>/<digits>/<digits>" yields
#    "<digits>/<digits>".
# 2. "...-<digits>-<digits>" yields the first digit run.
# 3. Anything after "chinatimes.com/" is returned as-is.
#
# @param url [String] address of the article
# @return [String, nil] the identifier, or nil when no pattern matches
def self.parse_url_id(url)
  url[%r{http://news\.chinatimes\.com/\w+/(\d+/\d+)}, 1] ||
    url[%r{[^-]*+[^-]*+-(\d+)-\d+}, 1] ||
    url[%r{chinatimes\.com/(.+)}, 1]
end
|
Instance Method Details
#clean_url ⇒ Object
69
70
71
72
|
# File 'lib/taiwanese_news_parser/parser/china_times.rb', line 69
# Replaces @article[:url] with the result of running it through a
# TaiwaneseNewsParser::UrlCleaner constructed with 'id'.
# NOTE(review): presumably 'id' names a query parameter the cleaner keeps;
# confirm against UrlCleaner.
#
# @return [Object] whatever the cleaner returned (also stored in @article[:url])
def clean_url
  @article[:url] = TaiwaneseNewsParser::UrlCleaner.new('id').clean(@article[:url])
end
|
#doc ⇒ Object
14
15
16
17
|
# File 'lib/taiwanese_news_parser/parser/china_times.rb', line 14
# Fetches the article page and parses it, caching the result.
#
# The first call downloads +url+, keeps the raw body in @raw and the parsed
# document in @doc. Later calls return the cached @doc, so the many lookups
# made while parsing one article share a single download.
#
# The page is opened through the parsed URI rather than Kernel#open:
# Kernel#open treats a leading "|" as a command to run, and it stopped
# handling URL strings in Ruby 3.0. The block form closes the handle as soon
# as the body has been read. This relies on open-uri being loaded, as the
# previous Kernel#open call on a URL did. Only URIs whose class supports
# #open are accepted; a local file path is no longer accepted here.
#
# @return [Object] the value returned by Nokogiri::HTML for the page body
# @raise [URI::InvalidURIError] when +url+ cannot be parsed as a URI
def doc
  @doc ||= begin
    @raw = URI.parse(url).open(&:read)
    Nokogiri::HTML(@raw)
  end
end
|
#parse ⇒ Object
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
# File 'lib/taiwanese_news_parser/parser/china_times.rb', line 21
# Fills @article from the fetched page and returns it.
#
# Sets :title, :company_name, :content, :reporter_name and :published_at,
# then calls clean_up (defined outside this section).
#
# :published_at is built with Time.new from the "Y年M月D日 H:M" text found
# under '.reporter time'; no zone is passed, so Ruby interprets it in the
# process's local time zone.
#
# @return [Hash] the populated @article
# @raise [NoMethodError] when the title node or the timestamp pattern is
#   missing from the page
def parse
  @article[:title] = doc.at_css('.page_container header h1').text
  @article[:company_name] = parse_company_name
  @article[:content] = doc.css('.page_container article>p').text
  @article[:reporter_name] = parse_reporter_name

  stamp = doc.css('.reporter time').text
  parts = stamp.match(/(\d*)年(\d*)月(\d*)日 (\d*):(\d*)/)
  # Captures 1..5 are year, month, day, hour and minute, in that order.
  @article[:published_at] = Time.new(*parts[1..5])

  clean_up
  @article
end
|
#parse_company_name ⇒ Object
55
56
57
58
59
60
61
62
63
64
65
66
67
|
# File 'lib/taiwanese_news_parser/parser/china_times.rb', line 55
# Works out the publishing company's name from the link directly under
# '.reporter'.
#
# The link is looked up once and reused; querying it twice would repeat the
# lookup (and any work +doc+ does on each call) for no benefit.
#
# @return [String] '中時電子報' when there is no such link or its text is
#   '新聞速報'; '時報週刊' when its text is '時週精選'; otherwise the link
#   text unchanged
def parse_company_name
  link = doc.at_css('.reporter>a')
  return '中時電子報' if link.nil?

  label = link.text
  # Section labels that do not name a company are mapped to the company.
  case label
  when '時週精選' then '時報週刊'
  when '新聞速報' then '中時電子報'
  else label
  end
end
|
#parse_reporter_name ⇒ Object
40
41
42
43
44
45
46
47
48
49
50
51
52
53
|
# File 'lib/taiwanese_news_parser/parser/china_times.rb', line 40
# Extracts the reporter's name from the '.reporter' element.
#
# An author link (a[rel=author]) wins when present. Otherwise the text
# nodes directly under '.reporter' are searched for a name that follows
# "記者" or "【" and ends at a slash. The slash may be ASCII "/",
# fullwidth "／", box-drawing "╱" or division slash "∕", so bylines typed
# with any of these separators are recognised.
#
# @return [String] the author link text, the captured name, or the whole
#   byline text when neither pattern matches
def parse_reporter_name
  author = doc.at_css('.reporter a[rel=author]')
  return author.text if author

  byline = doc.css('.reporter>text()').text
  byline[%r{記者(.+?)[/／╱∕]}, 1] ||
    byline[%r{【(.+?)[/／╱∕]}, 1] ||
    byline
end
|