Class: Birdsong::TweetScraper
- Defined in:
- lib/birdsong/scrapers/tweet_scraper.rb
Instance Method Summary collapse
Methods inherited from Scraper
#get_content_of_subpage_from_url, #initialize
Constructor Details
This class inherits a constructor from Birdsong::Scraper
Instance Method Details
#parse(id) ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/birdsong/scrapers/tweet_scraper.rb', line 9

# Scrape a single tweet and return its attributes as a Hash.
#
# Values read from the tweet's GraphQL payload: user, text, images / video,
# date, number of likes and language. Hashtags are not extracted.
#
# @param id [String, Integer] tweet id; interpolated into the status URL
# @return [Hash] with the keys +:images+, +:video+, +:video_preview_image+,
#   +:screenshot_file+, +:text+, +:date+, +:number_of_likes+, +:user+, +:id+,
#   +:language+ and +:video_file_type+
# @raise [Birdsong::NoTweetFoundError] when the result's "__typename" is
#   "TweetUnavailable"
def parse(id)
  Capybara.app_host = "https://x.com"

  # NOTE(review): the path always uses the "jack" handle regardless of the
  # author; presumably the site resolves the status by id alone -- confirm.
  graphql_object = get_content_of_subpage_from_url(
    "https://x.com/jack/status/#{id}",
    "/graphql",
    "data,tweetResult,result"
  )
  graphql_object = graphql_object.first if graphql_object.is_a?(Array)
  graphql_object = graphql_object["data"]["tweetResult"]["result"]

  raise Birdsong::NoTweetFoundError if graphql_object["__typename"] == "TweetUnavailable"

  # Certain types of tweets are wrapped in a "tweet" object
  graphql_object = graphql_object["tweet"] if graphql_object.key?("tweet")

  legacy = graphql_object["legacy"]
  text = legacy["full_text"]
  date = legacy["created_at"]
  tweet_id = legacy["id_str"]
  number_of_likes = legacy["favorite_count"]
  language = legacy["lang"]

  images = []
  videos = []
  video_preview_image = nil
  video_file_type = nil

  # Tweets without attachments may have no "media" entry at all.
  media_items = legacy.dig("entities", "media") || []
  media_items.each do |media|
    case media["type"]
    when "photo"
      images << Birdsong.retrieve_media(media["media_url_https"])
    when "video"
      video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
      # Variants without a bitrate are ranked as 0.
      best_variant = media["video_info"]["variants"].max_by { |variant| variant["bitrate"] || 0 }
      videos << Birdsong.retrieve_media(best_variant["url"])
      video_file_type = "video"
    when "animated_gif"
      video_preview_image = Birdsong.retrieve_media(media["media_url_https"])
      # NOTE(review): unlike the "video" branch this stores the raw URL rather
      # than the result of Birdsong.retrieve_media -- confirm that is intended.
      videos << media["video_info"]["variants"].first["url"]
      video_file_type = "animated_gif"
    end
  end

  # This has to run after the media extraction since it switches pages
  screenshot_file = take_screenshot

  user_object = graphql_object["core"]["user_results"]["result"]
  user_legacy = user_object["legacy"]
  user = {
    id: user_object["id"],
    name: user_legacy["name"],
    username: user_legacy["screen_name"],
    sign_up_date: user_legacy["created_at"],
    location: user_legacy["location"],
    profile_image_url: user_legacy["profile_image_url_https"],
    description: user_legacy["description"],
    followers_count: user_legacy["followers_count"],
    following_count: user_legacy["friends_count"],
    tweet_count: user_legacy["statuses_count"],
    listed_count: user_legacy["listed_count"],
    verified: user_legacy["verified"],
    url: user_legacy["url"],
    profile_image_file_name: Birdsong.retrieve_media(user_legacy["profile_image_url_https"])
  }

  {
    images: images,
    video: videos,
    video_preview_image: video_preview_image,
    screenshot_file: screenshot_file,
    text: text,
    date: date,
    number_of_likes: number_of_likes,
    user: user,
    id: tweet_id,
    language: language,
    video_file_type: video_file_type
  }
ensure
  # Call page.quit on every exit path; it used to be skipped whenever an
  # error (including NoTweetFoundError) was raised above.
  page.quit
end
#take_screenshot ⇒ Object
109 110 111 112 113 114 115 116 |
# File 'lib/birdsong/scrapers/tweet_scraper.rb', line 109

# Take a screenshot of the current page and store it under Birdsong's temp
# storage location, using a random UUID in the file name.
#
# NOTE(review): the "instagram_screenshot_" prefix looks like a leftover from
# another scraper; it is kept unchanged because the file name is observable
# outside this method.
#
# @return [Object] whatever +save_screenshot+ returns (presumably the path of
#   the saved file -- confirm)
def take_screenshot
  # rubocop:disable Lint/Debugger
  save_screenshot("#{Birdsong.temp_storage_location}/instagram_screenshot_#{SecureRandom.uuid}.png")
  # rubocop:enable Lint/Debugger
end