Class: Birdsong::TweetScraper

Inherits:
Scraper
  • Object
show all
Defined in:
lib/birdsong/scrapers/tweet_scraper.rb

Instance Method Summary collapse

Methods inherited from Scraper

#get_content_of_subpage_from_url, #initialize

Constructor Details

This class inherits a constructor from Birdsong::Scraper

Instance Method Details

#parse(id) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# File 'lib/birdsong/scrapers/tweet_scraper.rb', line 9

# Scrapes a single tweet and returns its attributes as a Hash.
#
# The tweet payload is obtained through `get_content_of_subpage_from_url`, media referenced by
# the payload is passed through `Birdsong.retrieve_media`, a screenshot is taken with
# `take_screenshot`, and the browser session is always quit before returning or raising.
#
# TODO: hashtags are not extracted yet.
#
# @param id [String, Integer] tweet id, interpolated into the status URL
# @return [Hash] with keys :images, :video, :video_preview_image, :screenshot_file, :text,
#   :date, :number_of_likes, :user, :id, :language and :video_file_type
# @raise [Birdsong::NoTweetFoundError] when the payload's "__typename" is "TweetUnavailable"
def parse(id)
  Capybara.app_host = "https://x.com"

  # NOTE(review): the "jack" handle is hard-coded; presumably the tweet is resolved by id
  # regardless of the handle in the path — confirm.
  payload = get_content_of_subpage_from_url(
    "https://x.com/jack/status/#{id}",
    "/graphql",
    "data,tweetResult,result"
  )
  payload = payload.first if payload.is_a?(Array)
  tweet = payload["data"]["tweetResult"]["result"]

  raise Birdsong::NoTweetFoundError if tweet["__typename"] == "TweetUnavailable"

  # Certain types of tweets are wrapped in a "tweet" object
  tweet = tweet["tweet"] if tweet.key?("tweet")
  legacy = tweet["legacy"]

  images = []
  videos = []
  video_preview_image = nil
  video_file_type = nil

  legacy["entities"].fetch("media", []).each do |media|
    case media["type"]
    when "photo"
      images << Birdsong.retrieve_media(media["media_url_https"])
    when "video"
      video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
      # Variants that carry no bitrate are ranked lowest so a real encoding wins when present.
      best_variant = media["video_info"]["variants"].max_by { |variant| variant["bitrate"] || 0 }
      videos << Birdsong.retrieve_media(best_variant["url"])
      video_file_type = "video"
    when "animated_gif"
      video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
      # NOTE(review): unlike the "video" branch, this stores the raw URL rather than the
      # result of Birdsong.retrieve_media. Left as-is because callers may rely on it — confirm.
      videos << media["video_info"]["variants"].first["url"]
      video_file_type = "animated_gif"
    end
  end

  screenshot_file = take_screenshot

  # Kept after the screenshot, as in the original ordering. An earlier comment claimed this
  # step switches pages; that is not evident from this method — confirm before reordering.
  user_object = tweet["core"]["user_results"]["result"]
  profile = user_object["legacy"]
  user = {
    id: user_object["id"],
    name: profile["name"],
    username: profile["screen_name"],
    sign_up_date: profile["created_at"],
    location: profile["location"],
    profile_image_url: profile["profile_image_url_https"],
    description: profile["description"],
    followers_count: profile["followers_count"],
    following_count: profile["friends_count"],
    tweet_count: profile["statuses_count"],
    listed_count: profile["listed_count"],
    verified: profile["verified"],
    url: profile["url"],
    profile_image_file_name: Birdsong.retrieve_media(profile["profile_image_url_https"])
  }

  {
    images: images,
    video: videos,
    video_preview_image: video_preview_image,
    screenshot_file: screenshot_file,
    text: legacy["full_text"],
    date: legacy["created_at"],
    number_of_likes: legacy["favorite_count"],
    user: user,
    # The id reported by the payload, not the argument that was passed in.
    id: legacy["id_str"],
    language: legacy["lang"],
    video_file_type: video_file_type
  }
ensure
  # Release the browser session on every exit path, including NoTweetFoundError and
  # malformed payloads, so a failed lookup does not leave a session open.
  page.quit
end

#take_screenshot ⇒ Object



109
110
111
112
113
114
115
116
# File 'lib/birdsong/scrapers/tweet_scraper.rb', line 109

# Saves a screenshot of the current page under Birdsong's temporary storage location, using a
# file name made unique with a random UUID.
#
# NOTE(review): the "instagram_screenshot_" file-name prefix looks like a leftover from another
# scraper; it is kept unchanged because it is part of the path produced at runtime.
#
# @return [Object] whatever `save_screenshot` returns (presumably the saved file's path — confirm)
def take_screenshot
  screenshot_path = "#{Birdsong.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png"

  # The directive pair scopes the Lint/Debugger exemption to this one call; taking the
  # screenshot is the purpose of the method, not leftover debugging.
  # rubocop:disable Lint/Debugger
  save_screenshot(screenshot_path)
  # rubocop:enable Lint/Debugger
end