71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
# File 'lib/twitterscraper/tweet.rb', line 71
# Builds a tweet object from one scraped tweet element.
#
# Top-level attributes (screen name, ids, display name, permalink) are read
# directly from `html`; everything else is looked up by XPath in the
# element's inner HTML after re-parsing it with Nokogiri.
#
# NOTE(review): the method name and the locals `tweet_id`, `retweets` and
# `parent_tweet_id` were blank in the reviewed text. The locals are restored
# from the keyword names passed to `new`; the method name `from_html` is an
# assumption -- confirm against callers.
#
# @param html [#attr, #inner_html, #to_s] the tweet element
# @return [Object, nil] whatever `new` returns, or nil (after logging a
#   warning) when `html.to_s` does not contain 'js-tweet-text-container'
# @raise [NoMethodError] when the avatar, text, conversation-id or timestamp
#   node is missing (those lookups still call methods on `first` directly)
def from_html(html)
  screen_name = html.attr('data-screen-name')
  tweet_id = html.attr('data-tweet-id')&.to_i

  unless html.to_s.include?('js-tweet-text-container')
    Twitterscraper.logger.warn "html doesn't include div.js-tweet-text-container url=https://twitter.com/#{screen_name}/status/#{tweet_id}"
    return nil
  end

  inner_html = Nokogiri::HTML(html.inner_html)

  avatar = inner_html.xpath("//img[@class[contains(., 'js-action-profile-avatar')]]").first
  profile_image_url = avatar.attr('src').gsub(/_bigger/, '')
  text = inner_html.xpath("//div[@class[contains(., 'js-tweet-text-container')]]/p[@class[contains(., 'js-tweet-text')]]").first.text

  # Keep only expanded URLs that are present and are not pic.twitter media links.
  links = inner_html.xpath("//a[@class[contains(., 'twitter-timeline-link')]]")
                    .map { |elem| elem.attr('data-expanded-url') }
                    .select { |link| link && !link.include?('pic.twitter') }
  image_urls = inner_html.xpath("//div[@class[contains(., 'AdaptiveMedia-photoContainer')]]")
                         .map { |elem| elem.attr('data-image-url') }
  video_url = inner_html.xpath("//div[@class[contains(., 'PlayableMedia-container')]]/a").first&.attr('href')
  # Always a strict boolean: nil.to_s is '', so a missing video counts as "no media".
  has_media = !image_urls.empty? || !video_url.to_s.empty?

  actions = inner_html.xpath("//div[@class[contains(., 'ProfileTweet-actionCountList')]]")
  # Reads one counter. A missing node or attribute yields 0 because nil.to_i == 0
  # (`&.` only guards the `attr` call; `to_i` still runs on the nil result).
  stat_count = lambda do |action_class|
    counter = actions.xpath("//span[@class[contains(., '#{action_class}')]]/span[@class[contains(., 'ProfileTweet-actionCount')]]").first
    counter&.attr('data-tweet-stat-count').to_i
  end
  likes = stat_count.call('ProfileTweet-action--favorite')
  retweets = stat_count.call('ProfileTweet-action--retweet')
  replies = stat_count.call('ProfileTweet-action--reply u-hiddenVisually')
  is_replied = replies != 0

  # A tweet whose conversation id equals its own id is not a reply to anything.
  parent_tweet_id = inner_html.xpath('//*[@data-conversation-id]').first.attr('data-conversation-id').to_i
  if tweet_id == parent_tweet_id
    is_reply_to = false
    parent_tweet_id = nil
    reply_to_users = []
  else
    is_reply_to = true
    reply_to_users = inner_html.xpath("//div[@class[contains(., 'ReplyingToContextBelowAuthor')]]/a").map do |user|
      { screen_name: user.text.delete_prefix('@'), user_id: user.attr('data-user-id') }
    end
  end

  timestamp = inner_html.xpath("//span[@class[contains(., 'js-short-timestamp')]]").first.attr('data-time').to_i

  new(
    screen_name: screen_name,
    name: html.attr('data-name'),
    user_id: html.attr('data-user-id').to_i,
    profile_image_url: profile_image_url,
    tweet_id: tweet_id,
    text: text,
    links: links,
    hashtags: text.scan(/#\w+/).map { |tag| tag.delete_prefix('#') },
    image_urls: image_urls,
    video_url: video_url,
    has_media: has_media,
    likes: likes,
    retweets: retweets,
    replies: replies,
    is_replied: is_replied,
    is_reply_to: is_reply_to,
    parent_tweet_id: parent_tweet_id,
    reply_to_users: reply_to_users,
    tweet_url: 'https://twitter.com' + html.attr('data-permalink-path'),
    timestamp: timestamp,
    created_at: Time.at(timestamp, in: '+00:00')
  )
end
|