Class: Watson::Parser

Inherits: Object
Includes: Watson
Defined in: lib/watson/parser.rb

Overview

Dir/File parser class. Contains all necessary methods to parse through files and directories for specified tags and to generate a data structure containing the found issues.

Constant Summary

COMMENT_DEFINITIONS =
{
    '.cpp'     => ['//', '/*'],        # C++
    '.cxx'     => ['//', '/*'],
    '.cc'      => ['//', '/*'],
    '.hpp'     => ['//', '/*'],
    '.hxx'     => ['//', '/*'],
    '.c'       => ['//', '/*'],        # C
    '.h'       => ['//', '/*'],
    '.java'    => ['//', '/*', '/**'], # Java
    '.class'   => ['//', '/*', '/**'],
    '.cs'      => ['//', '/*'],        # C#
    '.scss'    => ['//', '/*'],        # SASS SCSS
    '.sass'    => ['//', '/*'],        # SASS SCSS
    '.js'      => ['//', '/*'],        # JavaScript
    '.php'     => ['//', '/*', '#'],   # PHP
    '.m'       => ['//', '/*'],        # ObjectiveC
    '.mm'      => ['//', '/*'],
    '.go'      => ['//', '/*'],        # Go(lang)
    '.scala'   => ['//', '/*'],        # Scala
    '.erl'     => ['%%', '%'],         # Erlang
    '.f'       => ['!'],               # Fortran
    '.f90'     => ['!'],               # Fortran
    '.F'       => ['!'],               # Fortran
    '.F90'     => ['!'],               # Fortran
    '.hs'      => ['--'],              # Haskell
    '.sh'      => ['#'],               # Bash
    '.rb'      => ['#'],               # Ruby
    '.haml'    => ['-#'],              # Haml
    '.pl'      => ['#'],               # Perl
    '.pm'      => ['#'],
    '.t'       => ['#'],
    '.py'      => ['#'],               # Python
    '.coffee'  => ['#'],               # CoffeeScript
    '.zsh'     => ['#'],               # Zsh
    '.clj'     => [';;'],              # Clojure
    '.sql'     => ['---', '//', '#' ], # SQL and PL types
    '.lua'     => ['--', '--[['],      # Lua
    '.vim'     => ['"'],               # VimL
    '.md'      => ['<!--'],            # Markdown
    '.html'    => ['<!--'],            # HTML
    '.el'      => [';'],               # Emacs Lisp
    '.sqf'     => ['//','/*'],         # SQF
    '.sqs'     => [';'],               # SQS
    '.d'       => ['//','/*'],         # D
    '.tex'     => ['%'],               # LaTeX
    '.hbs'     => ['{{!--'],           # Handlebars
    '.twig'    => ['{#']               # Twig
}.freeze
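These defaults can be extended or overridden at runtime: #get_comment_type (below) merges the config's type_list on top of COMMENT_DEFINITIONS. A minimal sketch of that merge; the '.ex' entry is a hypothetical custom extension, not one of the shipped defaults:

custom_types = { '.ex' => ['#'] }   # hypothetical extra extension
comments     = Watson::Parser::COMMENT_DEFINITIONS.merge(custom_types)
comments['.ex']   # => ["#"]
comments['.rb']   # => ["#"]  (unchanged default)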

Constants included from Watson

BLUE, BOLD, CYAN, GRAY, GREEN, MAGENTA, RED, RESET, UNDERLINE, VERSION, WHITE, YELLOW

Instance Method Summary

Methods included from Watson

check_less, debug_print

Constructor Details

#initialize(config) ⇒ Parser

Initialize the parser with the current watson config



# File 'lib/watson/parser.rb', line 67

def initialize(config)
  # [review] - Not sure if passing config here is best way to access it

  # Identify method entry
  debug_print "#{ self } : #{ __method__ }\n"

  @config = config
end
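A minimal construction sketch, assuming `config` is an already-populated Watson::Config (built elsewhere from the RC file and command-line flags):

parser = Watson::Parser.new(config)
# parser now holds the config and is ready for #run, #parse_dir or #parse_file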

Instance Method Details

#get_comment_type(filename) ⇒ Object

Get comment syntax for given file



# File 'lib/watson/parser.rb', line 413

def get_comment_type(filename)

  # Identify method entry
  debug_print "#{ self } : #{ __method__ }\n"

  # Merge config file type list with defaults
  _comments = COMMENT_DEFINITIONS.merge(@config.type_list)

  # Grab all possible extensions and check each against the comment definitions
  # Return comment type if found in comment definitions, else false
  filename.split('.')[1..-1].each { |_ext| return _comments['.' << _ext] if _comments.has_key?('.' << _ext) }

  debug_print "Couldn't find any recognized extension type\n"
  false

end
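Illustrative calls, grounded in the COMMENT_DEFINITIONS table above (the filenames are hypothetical and `parser` is an instance built as in the constructor example):

parser.get_comment_type('lib/watson/parser.rb')  # => ["#"]
parser.get_comment_type('src/main.cpp')          # => ["//", "/*"]
parser.get_comment_type('archive.tar.gz')        # checks '.tar', then '.gz' => false
parser.get_comment_type('Makefile')              # no extension => false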

#parse_dir(dir, depth) ⇒ Object

Parse through specified directory and find all subdirs and files



# File 'lib/watson/parser.rb', line 118

def parse_dir(dir, depth)

  # Identify method entry
  debug_print "#{ self } : #{ __method__ }\n"

  # Error check on input
  if Watson::FS.check_dir(dir)
    debug_print "Opened #{ dir } for parsing\n"
  else
    print "Unable to open #{ dir }, exiting\n"
    return false
  end

  debug_print "Parsing through all files/directories in #{ dir }\n"

  # [review] - Shifted away from single Dir.glob loop to separate for dir/file
  #        This duplicates code but is much better for readability
  #        Not sure which is preferred?


  # Remove leading . or ./
  _glob_dir = dir.gsub(/^\.(\/?)/, '')
  debug_print "_glob_dir: #{_glob_dir}\n"


  # Go through directory to find all files
  # Create new array to hold all parsed files
  _completed_files = Array.new()
  Dir.glob("#{ _glob_dir }{*,.*}").select { |_fn| File.file?(_fn) }.sort.each do |_entry|
    debug_print "Entry: #{_entry} is a file\n"


    # [review] - Warning to user when file is ignored? (outside of debug_print)
    # Check against ignore list, if match, set to "" which will be ignored
    @config.ignore_list.each do |_ignore|
      if _mtch = _entry.match(_ignore)
        _entry = ''
        break
      end
    end

    # If the resulting entry (after filtering) isn't empty, parse it and push into file array
    unless _entry.empty?
      debug_print "Parsing #{ _entry }\n"
      _completed_files.push(parse_file(_entry))
    end

  end


  # Go through directory to find all subdirs
  # Create new array to hold all parsed subdirs
  _completed_dirs = Array.new()
  Dir.glob("#{ _glob_dir }{*,.*}").select { |_fn| File.directory?(_fn) }.sort.each do |_entry|
    debug_print "Entry: #{ _entry } is a dir\n"

    # Check if entry is in ignore list
    _skip = false

    @config.ignore_list.each do |_ignore|
      if _entry.match(_ignore)
        _skip = true
        break
      end
    end

    # If directory is on the ignore list then skip
    next if _skip == true

    debug_print "#{ _entry } was not on ignore list, adding\n"

    ## Depth limit logic
    # Current depth is depth of previous parse_dir (passed in as second param) + 1
    _cur_depth = depth + 1
    debug_print "Current Folder depth: #{ _cur_depth }\n"

    # If Config.parse_depth is 0, no limit on subdir parsing
    if @config.parse_depth == 0
      debug_print "No max depth, parsing directory\n"
      _completed_dirs.push(parse_dir("#{ _entry }/", _cur_depth))

    # If current depth is less than limit (set in config), parse directory and pass depth
    elsif _cur_depth < @config.parse_depth.to_i + 1
      debug_print "Depth less than max dept (from config), parsing directory\n"
      _completed_dirs.push(parse_dir("#{ _entry }/", _cur_depth))

    # Else, depth is greater than limit, ignore the directory
    else
      debug_print "Depth greater than max depth, ignoring\n"
    end

    # Add directory to ignore list so it isn't repeated again accidentally
    @config.ignore_list.push(_entry)
  end


  # [review] - Not sure if Dir.glob requires an explicit directory/file close?

  # Create hash to hold all parsed files and directories
  _structure           = Hash.new()
  _structure[:curdir]  = dir
  _structure[:files]   = _completed_files
  _structure[:subdirs] = _completed_dirs
  _structure
end
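The returned hash has a fixed shape; a sketch for a hypothetical 'lib/' directory (values illustrative):

structure = parser.parse_dir('lib/', 0)
structure[:curdir]   # => "lib/"
structure[:files]    # => array of issue-list hashes from #parse_file
structure[:subdirs]  # => array of hashes of this same shape, one per parsed subdirectory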

#parse_file(filename) ⇒ Object

Parse through individual files looking for issue tags, then generate formatted issue hash



# File 'lib/watson/parser.rb', line 227

def parse_file(filename)
  # [review] - Rename method input param to filename (more verbose?)

  # Identify method entry
  debug_print "#{ self } : #{ __method__ }\n"

  _relative_path = filename
  _absolute_path = File.absolute_path(filename)

  # Error check on input, use input filename to make sure relative path is correct
  if Watson::FS.check_file(_relative_path)
    debug_print "Opened #{ _relative_path } for parsing\n"
    debug_print "Short path: #{ _relative_path }\n"
  else
    print "Unable to open #{ _relative_path }, exiting\n"
    return false
  end


  # Get filetype and set corresponding comment type
  _comment_type = get_comment_type(_relative_path)
  unless _comment_type
    debug_print "Using default (#) comment type\n"
    _comment_type = ['#']
  end

  # Escape out comment type for safety
  # [review] - Is there a way to do inplace join?
  _comment_type = _comment_type.map { |comment| Regexp.escape(comment) }.join("|")
  debug_print "Comment type #{ _comment_type }\n"

  # [review] - It is possible to embed the valid tags in the regexp,
  # with a ~5% performance gain, but this would lose the warning about
  # unrecognized tags.
  _tag_format = Regexp.escape(@config.tag_format).gsub('\\ ', ' ')
  _tag_format_regex = _tag_format
                          .gsub("TAG", '(\w+)')
                          .gsub("COMMENT", '(.+)')
                          .gsub(' ' , '\s+')

  _comment_regex = /^(?:\s*[#{_comment_type}]+\s*)+#{_tag_format_regex}/

  debug_print "Comment regex: #{_comment_regex}\n"

  # Open file and read in entire thing into an array
  # Use an array so we can look ahead when creating issues later
  # [review] - Better var name than data for read in file?
  _data = File.read(_absolute_path).encode('UTF-8', :invalid => :replace).lines

  # Initialize issue list hash
  _issue_list = Hash.new()
  _issue_list[:relative_path] = _relative_path
  _issue_list[:absolute_path] = _absolute_path
  _issue_list[:has_issues] = false
  @config.tag_list.each do | _tag |
    debug_print "Creating array named #{ _tag }\n"
    # [review] - Use to_sym to make tag into symbol instead of string?
    _issue_list[_tag] = Array.new
  end

  # Loop through all array elements (lines in file) and look for issues
  _data.each_with_index do |_line, _i|

    # Find any comment line with [tag] - text (any comb of space and # acceptable)
    # Using if match to stay consistent (with config.rb); see there for an
    # explanation of why I do this (not a good one, per se...)
    begin
      _mtch = _line.match(_comment_regex)
    rescue ArgumentError
      debug_print "Could not encode to UTF-8, non-text\n"
    end

    unless _mtch
      # debug_print "No valid tag found in line, skipping\n"
      next
    end

    # Set tag
    _tag = _mtch[1].downcase

    # Make sure that the tag that was found is something we accept
    # If not, skip it but tell user about an unrecognized tag
    unless @config.tag_list.include?(_tag)
      formatter = Printer.new(@config).build_formatter
      formatter.print_status "+", GREEN
      print "Unknown tag [#{ _tag }] found, ignoring\n"
      print "      You might want to include it in your RC or with the -t/--tags flag\n"
      next
    end

    # Found a valid match (with recognized tag)
    # Set flag for this issue_list (for file) to indicate that
    _issue_list[:has_issues] = true

    # [review] - This could probably be done better, elsewhere!
    # If it's a HTML or Handlebars comment, remove trailing -->, --}}
    if _mtch[0].match(/[<{]+(!--)?(#)?/)
      _title = _mtch[2].gsub(/(--)?(#)?[>}]+/, "")
    else
      _title = _mtch[2]
    end
    debug_print "Issue found\n"
    debug_print "Tag: #{ _tag }\n"
    debug_print "Issue: #{ _title }\n"

    # Create hash for each issue found
    _issue               = Hash.new
    _issue[:line_number] = _i + 1
    _issue[:title]       = _title

    # Grab context of issue specified by Config param (+1 to include issue itself)
    _context             = _data[_i..(_i + @config.context_depth + 1)]

    # [review] - There has got to be a better way to do this...
    # Go through each line of context and determine indentation
    # Used to preserve indentation in post
    _cut                 = Array.new
    _context.each do |_line_sub|
      _max = 0
      # Until we reach a non indent OR the line is empty, keep slicin'
      until !_line_sub.match(/^( |\t|\n)/) || _line_sub.empty?
        # [fix] - Replace with inplace slice!
        _line_sub = _line_sub.slice(1..-1)
        _max      = _max + 1

        debug_print "New line: #{ _line_sub }\n"
        debug_print "Max indent: #{ _max }\n"
      end

      # Push max indent for current line to the _cut array
      _cut.push(_max)
    end

    # Print old _context
    debug_print "\n\n Old Context \n"
    debug_print PP.pp(_context, '')
    debug_print "\n\n"

    # Trim the context lines to be left aligned but maintain indentation
    # Then add a single \t to the beginning so the Markdown is pretty on GitHub/Bitbucket
    _context.map! { |_line_sub| "\t#{ _line_sub.slice(_cut.min .. -1) }" }

    # Print new _context
    debug_print("\n\n New Context \n")
    debug_print PP.pp(_context, '')
    debug_print("\n\n")

    _issue[:context] = _context

    # These are accessible from _issue_list, but we pass individual issues
    # to the remote poster, so we need this here to reference them for GitHub/Bitbucket
    _issue[:tag]     = _tag
    _issue[:path]    = _relative_path

    # Generate md5 hash for each specific issue (for bookkeeping)
    _issue[:md5]     = ::Digest::MD5.hexdigest("#{ _tag }, #{ _relative_path }, #{ _title }")
    debug_print "#{ _issue }\n"


    # [todo] - Figure out a way to queue up posts so user has a progress bar?
    # That way user can tell that wait is because of http calls not app

    # If GitHub is valid, pass _issue to GitHub poster function
    # [review] - Keep Remote as a static method and pass config every time?
    #       Or convert to a regular class and make an instance with @config


    # [review] - Use _tag string as symbol reference in hash or keep as string?
    # Look into to_sym to keep format of all _issue params the same
    _issue_list[_tag].push(_issue)

    # Increment issue counter for posting status
    @config.issue_count = @config.issue_count.next
  end

  # [review] - Return of parse_file is different than watson-perl
  # Not sure which makes more sense, ruby version seems simpler
  # perl version might have to stay since hash scoping is weird in perl
  debug_print "\nIssue list: #{ _issue_list }\n"

  _issue_list
end
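A sketch of the hash returned for a hypothetical file containing a single '# [todo] - add tests' comment on line 12, assuming 'todo' is in the configured tag list (all values illustrative):

issue_list = parser.parse_file('lib/example.rb')
issue_list[:relative_path]   # => "lib/example.rb"
issue_list[:absolute_path]   # => "/home/user/project/lib/example.rb"
issue_list[:has_issues]      # => true
issue_list['todo'].first     # => { :line_number => 12,
                             #      :title       => "add tests",
                             #      :context     => ["\t# [todo] - add tests\n", ...],
                             #      :tag         => "todo",
                             #      :path        => "lib/example.rb",
                             #      :md5         => "..." }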

#run ⇒ Object

Begins parsing of files / dirs specified in the initial dir/file lists



# File 'lib/watson/parser.rb', line 79

def run

  # Identify method entry
  debug_print "#{ self } : #{ __method__ }\n"

  # Go through all files added from CL (sort them first)
  # If empty, sort and each will do nothing, no errors
  _completed_dirs  = Array.new()
  _completed_files = Array.new()
  if @config.cl_entry_set
    @config.file_list.sort.each do |_file|
      _completed_files.push(parse_file(_file))
    end
  end

  # Then go through all the specified directories
  # Initial depth passed to parse_dir is 0 (the top level)
  @config.dir_list.sort.each do |_dir|
    _completed_dirs.push(parse_dir(_dir, 0))
  end

  # Create overall hash for parsed files
  _structure           = Hash.new()
  _structure[:files]   = _completed_files
  _structure[:subdirs] = _completed_dirs

  debug_print "_structure dump\n\n"
  debug_print PP.pp(_structure, '')
  debug_print "\n\n"

  # Pass structure to poster with count as 0
  Remote.post_structure(_structure, @config, 0)

  _structure
end
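A minimal end-to-end sketch, assuming `config` already has its dir_list / file_list populated (e.g. by the CLI):

parser    = Watson::Parser.new(config)
structure = parser.run
structure[:files]    # issue lists for files passed directly on the command line
structure[:subdirs]  # directory structures built by #parse_dir, one per entry in dir_list

Note that #run also hands the completed structure to Remote.post_structure for any configured remote (GitHub/Bitbucket) posting before returning it.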