Class: TwitterToCsv::CsvBuilder

Inherits: Object

Defined in: lib/twitter_to_csv/csv_builder.rb

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(options = {}) ⇒ CsvBuilder

Returns a new instance of CsvBuilder.

# File 'lib/twitter_to_csv/csv_builder.rb', line 10

def initialize(options = {})
  @options = options
  @sampled_fields = {}
  @num_samples = 0
  @retweet_counts = {}
  @retweet_hour_counts = {}
end
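
A minimal usage sketch, assuming the gem's top-level require, a hypothetical output file, and an illustrative field list (the option keys are the ones consumed by the methods documented below):

require 'twitter_to_csv'

builder = TwitterToCsv::CsvBuilder.new(
  :fields => ["id", "text", "user.screen_name"],   # dotted paths into each status hash
  :csv    => File.open("tweets.csv", "w")           # any IO-like object that responds to #puts
)
builder.run   # streams via TwitterWatcher, or replays a file when :replay_from_file is set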

Instance Attribute Details

#options ⇒ Object

Returns the value of attribute options.

# File 'lib/twitter_to_csv/csv_builder.rb', line 8

def options
  @options
end

#sampled_fields ⇒ Object

Returns the value of attribute sampled_fields.

# File 'lib/twitter_to_csv/csv_builder.rb', line 8

def sampled_fields
  @sampled_fields
end

Instance Method Details

#afinn ⇒ Object

# File 'lib/twitter_to_csv/csv_builder.rb', line 185

def afinn
  @afinn_cache ||= begin
    words_or_phrases = []
    File.read(File.expand_path(File.join(File.dirname(__FILE__), "afinn", "AFINN-111.txt"))).each_line do |line|
      word_or_phrase, valence = line.split(/\t/)
      pattern = Regexp::escape word_or_phrase.gsub(/-/, " ").gsub(/'/, '')
      words_or_phrases << [/\b#{pattern}\b/i, pattern.length, valence.to_f]
    end
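    # Sort patterns longest-first so multi-word phrases are matched before their component words.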
    words_or_phrases.sort {|b, a| a[1] <=> b[1] }
  end
end

#analyze_gaps(status, min_gap_size_in_minutes) ⇒ Object

# File 'lib/twitter_to_csv/csv_builder.rb', line 228

def analyze_gaps(status, min_gap_size_in_minutes)
  time = Time.parse(status['created_at'])
  if !@last_status_seen_at
    puts "First status seen at #{time}."
  else
    gap_length = (time - @last_status_seen_at) / 60
    if gap_length > min_gap_size_in_minutes
      puts "Gap of #{gap_length.to_i} minutes from #{@last_status_seen_at} to #{time}."
    end
  end
  @last_status_seen_at = time
end
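
Gap analysis is switched on via the :analyze_gaps option (see #handle_status), whose value is the minimum gap size in minutes. With a hypothetical :analyze_gaps => 30, any silence in the stream longer than half an hour would be reported in the format used above, for example:

Gap of 42 minutes from 2012-01-01 10:00:00 UTC to 2012-01-01 10:42:00 UTC.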

#compute_sentiment(original_text) ⇒ Object

# File 'lib/twitter_to_csv/csv_builder.rb', line 197

def compute_sentiment(original_text)
  text = original_text.downcase.gsub(/'/, '').gsub(/[^a-z0-9]/, ' ').gsub(/\s+/, ' ').strip
  count = 0
  valence_sum = 0
  afinn.each do |pattern, length, valence|
    while text =~ pattern
      text.sub! pattern, ''
      valence_sum += valence
      count += 1
    end
  end
  if count > 0
    [valence_sum / count.to_f, count]
  else
    [0, 0]
  end
end
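
A hedged illustration of the return shape; the actual numbers depend on which AFINN-111 entries happen to match the text:

average_valence, matched_terms = builder.compute_sentiment("some tweet text")
# => e.g. [2.0, 3]   mean valence of the matched words/phrases, and how many matched
# => [0, 0]          when nothing in the text appears in the lexicon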

#display_rolledup_status?(status) ⇒ Boolean

Returns:

  • (Boolean)

# File 'lib/twitter_to_csv/csv_builder.rb', line 48

def display_rolledup_status?(status)
  created_at = status['created_at'].is_a?(Time) ? status['created_at'] : Time.parse(status['created_at'])
  @newest_status_at = created_at if @newest_status_at.nil?

  if status['retweeted_status'] && status['retweeted_status']['id']
    # This is a retweet.
    original_created_at = status['retweeted_status']['created_at'].is_a?(Time) ? status['retweeted_status']['created_at'] : Time.parse(status['retweeted_status']['created_at'])
    if !options[:retweet_window] || created_at <= original_created_at + options[:retweet_window] * 60 * 60 * 24
      @retweet_counts[status['retweeted_status']['id']] ||= 0
      @retweet_counts[status['retweeted_status']['id']] = status['retweeted_status']['retweet_count'] if status['retweeted_status']['retweet_count'] > @retweet_counts[status['retweeted_status']['id']]

      if options[:retweet_counts_at]
        @retweet_hour_counts[status['retweeted_status']['id']] ||= options[:retweet_counts_at].map { 0 }
        options[:retweet_counts_at].each.with_index do |hour_mark, index|
          if created_at <= original_created_at + hour_mark * 60 * 60 && status['retweeted_status']['retweet_count'] > @retweet_hour_counts[status['retweeted_status']['id']][index]
            @retweet_hour_counts[status['retweeted_status']['id']][index] = status['retweeted_status']['retweet_count']
          end
        end
      end
    end
    false
  else
    # This is an original status.
    if (@retweet_counts[status['id']] || 0) >= (options[:retweet_threshold] || 0)
      if !options[:retweet_window] || created_at <= @newest_status_at - options[:retweet_window] * 60 * 60 * 24
        status['retweet_count'] = @retweet_counts[status['id']] if @retweet_counts[status['id']] && @retweet_counts[status['id']] > status['retweet_count']
        if options[:retweet_counts_at]
          retweet_hour_data = @retweet_hour_counts.delete(status['id'])
          if !retweet_hour_data
            puts "Encountered missing retweet_data for tweet##{status['id']}, possibly due to a repeating id or a deleted tweet."
            return false
          end
          status['_retweet_hour_counts'] = retweet_hour_data
        end
        true
      else
        false
      end
    else
      false
    end
  end
end
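
The rollup behaviour is controlled entirely by options; a hypothetical configuration, with :retweet_window in days and :retweet_counts_at in hours:

TwitterToCsv::CsvBuilder.new(
  :retweet_mode      => :rollup,       # roll retweets up into their original status
  :retweet_threshold => 5,             # only emit originals retweeted at least 5 times
  :retweet_window    => 3,             # ignore retweets more than 3 days after the original
  :retweet_counts_at => [1, 24, 48]    # also track the retweet count at 1, 24, and 48 hours
)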

#extract_fields(object, fields, current_path = []) ⇒ Object

# File 'lib/twitter_to_csv/csv_builder.rb', line 253

def extract_fields(object, fields, current_path = [])
  if object.is_a?(Hash)
    object.each do |k, v|
      extract_fields v, fields, current_path + [k]
    end
  else
    path = current_path.join(".")
    fields[path] ||= 0
    fields[path] += 1
  end
end
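
For illustration, a call on a small hand-made hash shows how nested keys are flattened into dotted paths with occurrence counts:

fields = {}
builder.extract_fields({ "user" => { "screen_name" => "bob", "id" => 1 }, "text" => "hi" }, fields)
fields   # => { "user.screen_name" => 1, "user.id" => 1, "text" => 1 }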

#handle_status(status, &block) ⇒ Object

# File 'lib/twitter_to_csv/csv_builder.rb', line 92

def handle_status(status, &block)
  if within_time_window?(status)
    if (options[:require_english] && is_english?(status)) || !options[:require_english]
      if options[:retweet_mode] != :rollup || display_rolledup_status?(status)
        log_json(status) if options[:json]
        log_csv(status) if options[:csv]
        yield_status(status, &block) if block
        sample_fields(status) if options[:sample_fields]
        analyze_gaps(status, options[:analyze_gaps]) if options[:analyze_gaps]
        STDERR.puts "Logging: #{status['text']}" if options[:verbose]
      end
    end
  end
end

#is_english?(status) ⇒ Boolean

Returns:

  • (Boolean)

# File 'lib/twitter_to_csv/csv_builder.rb', line 271

def is_english?(status)
  if status.has_key?('delete')
    STDERR.puts "Skipping Tweet with delete." if options[:verbose]
    return false
  end

  #unless status['user']['lang'] == "en"
  #  STDERR.puts "Skipping \"#{status['text']}\" due to lang of #{status['user']['lang']}." if options[:verbose]
  #  return false
  #end

  unless UnsupervisedLanguageDetection.is_english_tweet?(status['text'])
    STDERR.puts "Skipping \"#{status['text']}\" due to UnsupervisedLanguageDetection guessing non-English" if options[:verbose]
    return false
  end

  true
end

#log_csv(status) ⇒ Object

# File 'lib/twitter_to_csv/csv_builder.rb', line 130

def log_csv(status)
  options[:csv].puts output_row(status).to_csv(:encoding => 'UTF-8', :force_quotes => true)
end

#log_csv_header ⇒ Object

# File 'lib/twitter_to_csv/csv_builder.rb', line 107

def log_csv_header
  header_labels = options[:fields].dup

  header_labels += ["average_sentiment", "sentiment_words"] if options[:compute_sentiment]
  header_labels << "word_count" if options[:compute_word_count]

  header_labels << "normalized_source" if options[:normalize_source]

  (options[:date_fields] || []).each do |date_field|
    %w[week_day day month year hour minute second].each do |value|
      header_labels << "#{date_field}_#{value}"
    end
  end

  options[:retweet_counts_at].each { |hours| header_labels << "retweets_at_#{hours}_hours" } if options[:retweet_counts_at]

  options[:url_columns].times { |i| header_labels << "url_#{i+1}" } if options[:url_columns] && options[:url_columns] > 0
  options[:hashtag_columns].times { |i| header_labels << "hash_tag_#{i+1}" } if options[:hashtag_columns] && options[:hashtag_columns] > 0
  options[:user_mention_columns].times { |i| header_labels << "user_mention_#{i+1}" } if options[:user_mention_columns] && options[:user_mention_columns] > 0

  options[:csv].puts header_labels.to_csv(:encoding => 'UTF-8', :force_quotes => true)
end
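
For example, with the hypothetical options :fields => %w[id text], :compute_word_count => true and :url_columns => 2, the header row written to the CSV would be:

"id","text","word_count","url_1","url_2"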

#log_json(status) ⇒ Object

# File 'lib/twitter_to_csv/csv_builder.rb', line 265

def log_json(status)
  options[:json].puts JSON.dump(status) #JSON.pretty_generate(status)
  options[:json].puts "------SEPARATOR------"
  options[:json].flush
end

#output_row(status) ⇒ Object

# File 'lib/twitter_to_csv/csv_builder.rb', line 138

def output_row(status)
  row = options[:fields].map do |field|
    field.split(".").inject(status) { |memo, segment|
      memo && memo[segment]
    }.to_s
  end

  row += compute_sentiment(status["text"]) if options[:compute_sentiment]

  row << status["text"].split(/\s+/).length if options[:compute_word_count]

  row << status["source"].gsub(/<[^>]+>/, '').strip if options[:normalize_source]

  (options[:date_fields] || []).each do |date_field|
    time = Time.parse(date_field.split(".").inject(status) { |memo, segment|
      memo && memo[segment]
    }.to_s)

    row << time.strftime("%w") # week_day
    row << time.strftime("%-d") # day
    row << time.strftime("%-m") # month
    row << time.strftime("%Y") # year
    row << time.strftime("%-H") # hour
    row << time.strftime("%M") # minute
    row << time.strftime("%S") # second
  end

  row += status["_retweet_hour_counts"] if options[:retweet_counts_at]

  if options[:url_columns] && options[:url_columns] > 0
    urls = (status["entities"] && (status["entities"]["urls"] || []).map {|i| i["expanded_url"] || i["url"] }) || []
    options[:url_columns].times { |i| row << urls[i].to_s }
  end

  if options[:hashtag_columns] && options[:hashtag_columns] > 0
    hashes = (status["entities"] && (status["entities"]["hashtags"] || []).map {|i| i["text"] }) || []
    options[:hashtag_columns].times { |i| row << hashes[i].to_s }
  end

  if options[:user_mention_columns] && options[:user_mention_columns] > 0
    users = (status["entities"] && (status["entities"]["user_mentions"] || []).map {|i| i["screen_name"] }) || []
    options[:user_mention_columns].times { |i| row << users[i].to_s }
  end

  row
end
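
The dotted-path lookup at the top of the method is a nil-safe chain of hash accesses; for a hypothetical field it behaves like:

"user.screen_name".split(".").inject(status) { |memo, segment| memo && memo[segment] }
# equivalent to status["user"] && status["user"]["screen_name"], with #to_s turning a missing value into ""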

#replay_from(filename, &block) ⇒ Object

# File 'lib/twitter_to_csv/csv_builder.rb', line 215

def replay_from(filename, &block)
  # If a retweet mode is being used, we read the file backwards using the Elif gem.
  opener = options[:retweet_mode] ? Elif : File

  opener.open(filename, "r") do |file|
    file.each do |line|
      next if line =~ /\A------SEP.RATOR------\Z/i
      handle_status JSON.parse(line), &block
    end
  end
  puts "Last status seen at #{@last_status_seen_at}." if options[:analyze_gaps] && @last_status_seen_at
end

#run(&block) ⇒ Object

# File 'lib/twitter_to_csv/csv_builder.rb', line 18

def run(&block)
  log_csv_header if options[:csv] && !options[:csv_appending]
  if options[:replay_from_file]
    replay_from options[:replay_from_file], &block
  else
    begin
      TwitterWatcher.new(options).run do |status|
        handle_status status
      end
    rescue SignalException, SystemExit
      EventMachine::stop_event_loop
      exit
    rescue StandardError => e
      STDERR.puts "\nException #{e.message}:\n#{e.backtrace.join("\n")}\n\n"
      STDERR.puts "Waiting for a couple of minutes..."
      sleep 120
      retry
    end
  end
end
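
A replay sketch, assuming a previously captured JSON dump (the separator lines written by #log_json are skipped by #replay_from):

builder = TwitterToCsv::CsvBuilder.new(
  :replay_from_file => "archive.json",
  :csv    => File.open("tweets.csv", "w"),
  :fields => %w[created_at text]
)
builder.run   # reads the dump (backwards via Elif when a :retweet_mode is set) instead of streaming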

#sample_fields(status) ⇒ Object

# File 'lib/twitter_to_csv/csv_builder.rb', line 241

def sample_fields(status)
  extract_fields(status, sampled_fields)
  @num_samples += 1
  if @num_samples > options[:sample_fields]
    puts "Sampled fields from Twitter:"
    sampled_fields.each do |field, count|
      puts " #{field} #{' ' * [60 - field.length, 0].max} #{count}"
    end
    exit 1
  end
end

#within_time_window?(status) ⇒ Boolean

Returns:

  • (Boolean)

# File 'lib/twitter_to_csv/csv_builder.rb', line 39

def within_time_window?(status)
  if options[:start_time] || options[:end_time]
    created_at = status['created_at'].is_a?(Time) ? status['created_at'] : Time.parse(status['created_at'])
    return false if options[:start_time] && created_at < options[:start_time]
    return false if options[:end_time] && created_at >= options[:end_time]
  end
  true
end
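
The bounds are supplied as Time objects in the options, e.g. with hypothetical dates:

require 'time'

TwitterToCsv::CsvBuilder.new(
  :start_time => Time.parse("2012-01-01 00:00:00 UTC"),   # statuses created before this are skipped
  :end_time   => Time.parse("2012-02-01 00:00:00 UTC")    # statuses created at or after this are skipped
)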

#yield_status(status, &block) ⇒ Object

# File 'lib/twitter_to_csv/csv_builder.rb', line 134

def yield_status(status, &block)
  block.call output_row(status)
end