Class: SkinnyJeans::LogParser

Inherits:
Object
  • Object
show all
Defined in:
lib/skinny_jeans/log_parser.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(logfile_path, sqlite_db_path, path_regexp, date_regexp) ⇒ LogParser

Returns a new instance of LogParser.



11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/skinny_jeans/log_parser.rb', line 11

# Builds a parser for a single log file and prepares the SQLite database
# that results are written to.
#
# @param logfile_path [#to_s] path of the log file to parse; it is treated as
#   gzip-compressed only when the path ends in ".gz"
# @param sqlite_db_path [String] handed to SkinnyJeans::prepare_db
# @param path_regexp [Regexp] applied to each log line; capture group 1 is the path
# @param date_regexp [Regexp] applied to each log line; capture group 1 is the date string
def initialize(logfile_path, sqlite_db_path, path_regexp, date_regexp)
  @logfile_path   = logfile_path
  @sqlite_db_path = sqlite_db_path
  @path_regexp    = path_regexp
  @date_regexp    = date_regexp

  # Anchor on the extension: a bare /gz/ also matched directory or file names
  # that merely contain "gz" (e.g. "/var/log/gzone/access.log").
  @is_gzipped = !logfile_path.to_s[/\.gz\z/].nil?
  SkinnyJeans::prepare_db(@sqlite_db_path)

  # in my tests, setting synchronous=1 (or even 0) had little or no speed gain
  # but I leave it at one for situations with poor IO
  SkinnyJeans::SkinnyJeanDb.connection.execute("PRAGMA synchronous=1")

  @hash_of_dates = {}
  @hash_of_dates_for_keywords = {}
  @last_datetime = nil
  # Read through #last_pageview_at before #execute has counted any line,
  # so give it an explicit initial value.
  @last_pageview_at = nil
end

Instance Attribute Details

#hash_of_dates ⇒ Object

Returns the value of attribute hash_of_dates.



9
10
11
# File 'lib/skinny_jeans/log_parser.rb', line 9

# Reader for the in-memory pageview tally.
#
# #execute iterates it as date => { path => count } when persisting, and
# sets it to nil once persistence has finished.
#
# @return [Hash, nil]
def hash_of_dates; @hash_of_dates; end

#hash_of_dates_for_keywords ⇒ Object

Returns the value of attribute hash_of_dates_for_keywords.



9
10
11
# File 'lib/skinny_jeans/log_parser.rb', line 9

# Reader for the in-memory keyword tally.
#
# #execute iterates it as date => { path => { keyword => count } } when
# persisting, and sets it to nil once persistence has finished.
#
# @return [Hash, nil]
def hash_of_dates_for_keywords; @hash_of_dates_for_keywords; end

#last_pageview_at ⇒ Object

Returns the value of attribute last_pageview_at.



9
10
11
# File 'lib/skinny_jeans/log_parser.rb', line 9

# Reader for the timestamp of the most recent log line counted by #execute.
#
# #execute assigns it each time a line is counted, so it is nil until then.
#
# @return [Object, nil] whatever parse_string_as_date produced for that line
def last_pageview_at; @last_pageview_at; end

Class Method Details

.execute(logfile_path, sqlite_db_path, path_regexp, date_regexp) ⇒ Object



5
6
7
# File 'lib/skinny_jeans/log_parser.rb', line 5

# Convenience entry point: builds a parser from the given arguments and
# immediately runs #execute on it with default options.
#
# @param logfile_path [#to_s] forwarded to .new
# @param sqlite_db_path [String] forwarded to .new
# @param path_regexp [Regexp] forwarded to .new
# @param date_regexp [Regexp] forwarded to .new
# @return [Object] whatever the instance-level #execute returns
def self.execute(logfile_path, sqlite_db_path, path_regexp, date_regexp)
  parser = new(logfile_path, sqlite_db_path, path_regexp, date_regexp)
  parser.execute
end

Instance Method Details

#execute(options = {}) ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/skinny_jeans/log_parser.rb', line 26

# Parses the log file and persists the aggregated counts.
#
# Steps, in order:
# 1. Loads the most recent Update row. When one exists, scans the log for the
#    line stored there so that everything up to and including it is skipped.
# 2. Reads the log, filters lines, and hands each remaining hit to
#    insert_or_increment (defined outside this view).
# 3. Adds the tallies from #hash_of_dates and #hash_of_dates_for_keywords to
#    Pageview / PageviewKeyword rows, one transaction per date.
# 4. Creates a new Update row, prints a summary, and optionally VACUUMs.
#
# A line is skipped when it is blank, has no user agent field, has a user
# agent that does not look like a browser or looks like a bot, has a status
# other than 200, or does not match the path/date regexps. Lines that raise
# an "invalid byte sequence in UTF-8" ArgumentError are printed and skipped.
#
# Progress and timings are written with puts.
#
# @param options [Hash] :vacuum => truthy runs "VACUUM" after persisting
# @return [LogParser] self
# @raise [ArgumentError] any ArgumentError other than the invalid-UTF-8 case
def execute(options = {})
  lines_parsed = 0
  last_line_parsed, last_pageview_at, lineno_of_last_line_parsed = [nil, nil, nil]
  # last_update = Update.order("id DESC").limit(1).first
  last_update = Update.find(:first, :order => "id DESC", :limit => 1)

  # see if the last line parsed exists in the current log file;
  # if it doesn't, we'll simply read anything with a timestamp greater than last_pageview_at
  if last_update
    last_pageview_at, last_line_parsed = last_update.last_pageview_at, last_update.last_line_parsed
    file_reader do |line, lineno|
      # only the first 255 characters are compared because that is all that is stored
      if line.to_s[0..254] == last_line_parsed.to_s[0..254]
        lineno_of_last_line_parsed = lineno
        break
      end
    end
    puts "last line parsed was\n#{last_line_parsed}\nat lineno #{lineno_of_last_line_parsed}"
  end

  realtime = Benchmark.realtime do
    file_reader do |line, lineno|
      next if lineno_of_last_line_parsed && lineno <= lineno_of_last_line_parsed

      begin
        next if line.strip == ""

        # try to block most of the bots
        _user_agent = line.split('"')[5] # parse out the user agent... TODO: make this configurable
        # a line with fewer than six quote-delimited fields has no user agent to inspect
        next if _user_agent.nil?
        next if !_user_agent[/Mozilla\/|Opera/] || !!_user_agent[/bot|crawler|spider|slurp/i]

        # only count 200 responses; a line without a status gives nil, and nil.to_i is 0
        next if line[/\s(\d\d\d)\s/, 1].to_i != 200
        path_match = line[@path_regexp, 1]
        next if path_match.nil?
        date_match = line[@date_regexp, 1]
        next if date_match.nil?
        datetime_obj = parse_string_as_date(date_match)
      rescue ArgumentError => e
        if e.message.match(/invalid byte sequence in UTF-8/)
          puts "failed to parse the following line because of #{e.class.name}: #{e.message}"
          puts line
          next
        else
          raise(e)
        end
      end

      # when the previous last line was not found in this file, fall back to the timestamp
      next if lineno_of_last_line_parsed.nil? && !last_pageview_at.nil? && datetime_obj < last_pageview_at

      insert_or_increment(datetime_obj, path_match, SkinnyJeans::StringParser.extract_search_query(line))
      @last_pageview_at = datetime_obj
      last_line_parsed = line.to_s[0..254] # only 255 characters because we store it in the database
      lines_parsed += 1
    end
  end

  puts "completed parsing in #{realtime}"

  persisted = 0
  persisted_pageview_keywords = 0
  realtime = Benchmark.realtime do
    hash_of_dates.each do |date, hash_of_paths|
      Pageview.transaction do
        # reuses the method-level `realtime` local so the value is still visible after this block
        realtime = Benchmark.realtime do
          Spinner::with_spinner(:count=>hash_of_paths.keys.size, :message=>"Inserting rows into database for pageviews #{date}...") do |spin|
            hash_of_paths.each do |path, count|
              pv = Pageview.find_or_create_by_date_and_path(date, path)
              pv.pageview_count ||= 0
              pv.pageview_count += count
              pv.save!
              persisted += 1
              spin.call
            end
          end
        end
      end

      puts "completed pageviews date #{date.inspect} with #{hash_of_paths.keys.size} keys in #{realtime}"
    end

    hash_of_dates_for_keywords.each do |date, hash_of_paths|
      PageviewKeyword.transaction do
        realtime = Benchmark.realtime do
          Spinner::with_spinner(:count=>hash_of_paths.keys.size, :message=>"Inserting rows into database for pageview_keywords #{date}...") do |spin|
            hash_of_paths.each do |path, keyword_counts|
              keyword_counts.each do |keyword, count|
                pvk = PageviewKeyword.find_or_create_by_date_and_path_and_keyword(date, path, keyword)
                pvk.keyword = keyword.to_s[0..254]
                pvk.pageview_count ||= 0
                pvk.pageview_count += count
                pvk.save!
                persisted_pageview_keywords += 1
              end
              spin.call
            end
          end
        end
      end

      puts "completed pageview_keywords date #{date.inspect} with #{hash_of_paths.keys.size} keys in #{realtime}"
    end
  end

  puts "completed persistence in #{realtime}"

  # If no line was counted this round, keep the previous high-water mark instead
  # of storing nil; a nil would disable the timestamp fallback on the next run.
  Update.create!({:last_pageview_at => (@last_pageview_at || last_pageview_at), :lines_parsed => lines_parsed, :last_line_parsed => last_line_parsed.to_s[0..254]})

  puts("total records in DB: #{Pageview.count}\n" \
       "lines parsed this round: #{lines_parsed}\n" \
       "lines persisted this round:#{persisted}\n" \
       "total SkinnyJeans executions since inception: #{Update.count}")

  if options[:vacuum]
    puts("vacuuming DB")
    SkinnyJeans::SkinnyJeanDb.connection.execute("VACUUM")
    puts("vacuuming complete")
  end

  # i dont think we need to keep these now that we've successfully persisted
  @hash_of_dates = nil
  @hash_of_dates_for_keywords = nil

  return self
end

#file_reader ⇒ Object

Copies the log file to a temporary path, yields each line to the caller's block, then removes the copy.



159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# File 'lib/skinny_jeans/log_parser.rb', line 159

# Copies the log file to /tmp, yields every line of the copy together with its
# zero-based line number, then removes the copy.
#
# The copy is read through Zlib::GzipReader when @is_gzipped is set, and as a
# plain file otherwise. The file handle is closed and the copy is removed even
# when the caller's block exits early with `break` or raises.
#
# @yield [Array(String, Integer)] a single [line, lineno] pair per line; a
#   block declared as |line, lineno| receives the two values destructured
# @return [nil]
def file_reader
  temp_file_path = "/tmp/" + File.basename("#{@logfile_path}.copy")

  begin
    FileUtils.cp(@logfile_path, temp_file_path)

    if @is_gzipped
      Zlib::GzipReader.open(temp_file_path) do |gz|
        lineno = 0
        gz.each_line do |line|
          yield([line, lineno])
          lineno += 1
        end
      end
    else
      File.open(temp_file_path, "r") do |file|
        file.each_with_index { |line, lineno| yield([line, lineno]) }
      end
    end
  ensure
    # runs on normal completion, on `break` from the caller's block, and on exceptions
    FileUtils.rm_f(temp_file_path)
  end

  nil
end

#get_ar_class(klass) ⇒ Object



179
180
181
# File 'lib/skinny_jeans/log_parser.rb', line 179

# Returns the class it is given.
#
# NOTE(review): the rescue below looks unreachable, because returning an
# already-evaluated argument cannot raise. If it ever did fire, the method
# would return the result of prepare_db rather than klass. Confirm intent.
#
# @param klass [Class] the model class to hand back
# @return [Class] klass
def get_ar_class(klass)
  return klass
rescue ActiveRecord::ConnectionNotEstablished
  SkinnyJeans::prepare_db(@sqlite_db_path)
end

#pageview ⇒ Object



175
# File 'lib/skinny_jeans/log_parser.rb', line 175

# Returns the Pageview class by way of #get_ar_class.
#
# @return [Object] whatever get_ar_class returns for Pageview
def pageview
  get_ar_class(Pageview)
end

#pageview_keyword ⇒ Object



177
# File 'lib/skinny_jeans/log_parser.rb', line 177

# Returns the PageviewKeyword class by way of #get_ar_class.
#
# @return [Object] whatever get_ar_class returns for PageviewKeyword
def pageview_keyword
  get_ar_class(PageviewKeyword)
end

#update ⇒ Object



176
# File 'lib/skinny_jeans/log_parser.rb', line 176

# Returns the Update class by way of #get_ar_class.
#
# @return [Object] whatever get_ar_class returns for Update
def update
  get_ar_class(Update)
end