Class: InstagramCrawler::GetHashTagUrl

Inherits:
Object
  • Object
show all
Defined in:
lib/instagram_crawler.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(hashtag) ⇒ GetHashTagUrl

Returns a new instance of GetHashTagUrl.



22
23
24
25
26
# File 'lib/instagram_crawler.rb', line 22

def initialize(hashtag)
  @logger = Logger.new(STDOUT)
  @hashtag = hashtag
  @images = []
end

Instance Attribute Details

#images ⇒ Object

Returns the value of attribute images.



21
22
23
# File 'lib/instagram_crawler.rb', line 21

# @return [Array<Hash>] the formatted image entries collected so far
def images
  @images
end

Instance Method Details

#download_hashtag(limit) ⇒ Object



36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/instagram_crawler.rb', line 36

# Downloads pages of posts for @hashtag until either enough images have
# been collected or Instagram reports no further pages, then truncates
# the buffer to at most +limit+ entries.
#
# @param limit [Integer] maximum number of images to collect
# @return [Array<Hash>] the collected, formatted image entries
def download_hashtag(limit)
  next_page = nil
  number = 0
  loop do
    get_number, next_page = download_hashtag_page(@hashtag, next_page)
    number += get_number
    info "getting #{number} ##{@hashtag} images...."
    # Stop as soon as we have enough: the original `number > limit`
    # fetched one extra page when the limit was hit exactly.
    break if next_page.nil? || number >= limit
  end
  @images = @images.take(limit) if @images.count > limit
  @images
end

#download_hashtag_page(hashtag, page) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/instagram_crawler.rb', line 49

# Fetches one page of results for +hashtag+ and appends every formatted
# edge to @images.
#
# @param hashtag [String] hashtag to query
# @param page [String, nil] pagination cursor, or nil for the first page
# @return [Array(Integer, String)] count of edges seen and the cursor for
#   the next page (nil when there are no more pages)
def download_hashtag_page(hashtag, page)
  json_result = download_per_page(hashtag, page)
  return [0, nil] if json_result.nil?
  hashtag_info = json_result['graphql']['hashtag']
  # top_posts are only merged in on the first request; paginated
  # requests would re-serve the same top posts.
  parts = page ? %w(media) : %w(media top_posts)
  size = parts.sum do |part|
    edges = hashtag_info["edge_hashtag_to_#{part}"]['edges']
    edges.each { |edge| @images << format_edge(edge) }
    edges.size
  end
  page_info = hashtag_info['edge_hashtag_to_media']['page_info']
  next_cursor = page_info['has_next_page'] ? page_info['end_cursor'] : nil
  [size, next_cursor]
end

#download_per_page(hashtag, page, use_cookie = false) ⇒ Object



89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/instagram_crawler.rb', line 89

# Fetches and parses the JSON document for one hashtag page.
#
# @param hashtag [String] hashtag to query
# @param page [String, nil] pagination cursor, or nil for the first page
# @param use_cookie [Boolean] passed through to http_access
# @param max_retries [Integer] attempts to retry a failed download
# @return [Hash, nil] parsed JSON, or nil on a bad URI, unparsable
#   response, or repeated download failure
def download_per_page(hashtag, page, use_cookie = false, max_retries = 3)
  download_uri = hashtag_page_url(hashtag, page)
  return if download_uri.nil?
  result = http_access(download_uri, use_cookie)
  JSON.parse(result)
rescue JSON::ParserError
  error "JSON parsing failed for URI: #{download_uri}, not retrying"
  nil
rescue => e
  # The original retried unconditionally, so a persistent failure hung
  # the crawler forever; bound the retry loop instead.
  if (max_retries -= 1) >= 0
    error "Download failed for URI: #{download_uri} retrying ..."
    sleep 1
    retry
  else
    error "Download failed for URI: #{download_uri}, giving up (#{e})"
    nil
  end
end

#error(message) ⇒ Object



32
33
34
# File 'lib/instagram_crawler.rb', line 32

# Logs +msg+ at ERROR severity on the instance logger.
def error(msg)
  @logger.error(msg)
end

#format_edge(edge) ⇒ Object

Returns the result hash built for the given edge.



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/instagram_crawler.rb', line 72

# Converts one raw API edge into a flat attribute hash.
#
# @param edge [Hash, nil] a raw "edges" entry from the hashtag JSON
# @return [Hash, nil] formatted attributes, or nil for nil/unusable input
def format_edge(edge)
  return if edge.nil?
  node = edge["node"]
  return if node.nil?
  # The original indexed edges[0]["node"]["text"] directly, so any post
  # without a caption raised and was silently dropped by the rescue.
  # Use dig so caption-less posts are kept with a nil text.
  caption = node.dig("edge_media_to_caption", "edges", 0, "node", "text")
  {
    id: node["id"].to_i,
    text: caption,
    shortcode: node["shortcode"],
    dimensions: node["dimensions"],
    image_url: node["display_url"],
    owner: node["owner"],
    thumbnail_url: node["thumbnail_src"],
    hashtag: caption.to_s.scan(%r|\s?(#[^\s ]+)\s?|).flatten,
    # Key typo ("thumnail") preserved: consumers read this exact key.
    thumnail_images: node["thumbnail_resources"]
  }
rescue StandardError
  # Last-resort guard against unexpected payload shapes.
  nil
end

#hashtag_page_url(hashtag, page) ⇒ Object



137
138
139
140
141
142
143
144
145
# File 'lib/instagram_crawler.rb', line 137

# Builds the JSON endpoint URI for a hashtag page.
#
# @param hashtag [String] hashtag to query
# @param page [String, nil] pagination cursor, appended as max_id when given
# @return [URI, nil] the parsed URI, or nil when +hashtag+ cannot form a
#   valid URI (logged as an error)
def hashtag_page_url(hashtag, page)
  url = "https://www.instagram.com/explore/tags/#{hashtag}/?__a=1"
  url += "&max_id=#{page}" if page
  # URI.encode was deprecated and removed in Ruby 3.0;
  # URI::DEFAULT_PARSER.escape performs the same generic escaping.
  URI.parse(URI::DEFAULT_PARSER.escape(url))
rescue URI::InvalidURIError
  error "Invalid hashtag #{hashtag} .."
end

#http_access(uri, use_cookie = false, try = 3, first = true) ⇒ Object



103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# File 'lib/instagram_crawler.rb', line 103

# Performs an HTTP GET for +uri+, retrying on errors and 4xx/5xx codes.
#
# @param uri [URI] resource to fetch
# @param use_cookie [Boolean] when true, issues the request over a
#   manually built SSL connection instead of Net::HTTP.get_response
# @param try [Integer] remaining retry budget
# @param first [Boolean] whether this is the first attempt; guards the
#   extended rate-limit retry from being re-entered recursively
# @return [String] the response body, or "" when every retry failed
def http_access(uri, use_cookie = false, try = 3, first = true)
  response = if use_cookie
               http = Net::HTTP.new(uri.hostname, uri.port)
               http.use_ssl = true
               # NOTE(review): 'Cookie' => false sends no meaningful
               # cookie header — preserved as-is; confirm the intent.
               http.get(uri, 'Cookie' => false)
             else
               Net::HTTP.get_response uri
             end
  if response.code.to_i >= 399
    if try > 0
      warn "Downloading #{uri} ended in #{response.code} Retrying #{try} times."
      # Net::HTTP response codes are Strings; the original compared the
      # String to the Integer 429, so the rate-limit branch never ran.
      if response.code.to_i == 429
        sleep 5
        return http_access(uri, use_cookie, 10, false) if first
      else
        sleep 1
      end
      http_access(uri, use_cookie, try - 1, first)
    else
      response.body
    end
  else
    response.body
  end
rescue => e
  if try > 0
    warn "Downloading #{uri} ended in #{e}. Retrying #{try} times"
    sleep 1
    http_access(uri, use_cookie, try - 1, first)
  else
    ""
  end
end

#info(message) ⇒ Object



28
29
30
# File 'lib/instagram_crawler.rb', line 28

# Logs +msg+ at INFO severity on the instance logger.
def info(msg)
  @logger.info(msg)
end