Class: Regex::Extractor

Inherits:
Object
  • Object
show all
Defined in:
lib/regex/extractor.rb

Overview

Supports [:name:] notation for substitution of built-in templates.

Defined Under Namespace

Classes: Match

Constant Summary collapse

DELIMINATOR_GROUP =

When the regular expression returns multiple groups, each is divided by the group deliminator. This is the default value.

29.chr + "\n"
DELIMINATOR_RECORD =

When using repeat mode, each match is divided by the record deliminator. This is the default value.

30.chr + "\n"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(*io) ⇒ Extractor

New extractor.



78
79
80
81
82
83
84
85
86
87
# File 'lib/regex/extractor.rb', line 78

# New extractor.
#
# io - any number of inputs to search; a trailing Hash, if present, is
#      removed from the list and treated as options. Each option `k => v`
#      is applied by calling the `k=` writer on this object.
#
# ANSI output is switched on before options are applied, so an `ansi`
# option can override it.
def initialize(*io)
  options = io.last.is_a?(Hash) ? io.pop : {}

  @ansi = true
  @io   = io

  options.each_pair { |key, value| __send__("#{key}=", value) }
end

Instance Attribute Details

#ansiObject

Use ANSI codes in output?



72
73
74
# File 'lib/regex/extractor.rb', line 72

# Use ANSI codes in output?
# Returns the value currently held in @ansi.
def ansi; @ansi; end

#detailObject

Provide detailed output.



69
70
71
# File 'lib/regex/extractor.rb', line 69

# Provide detailed output.
# Returns the value currently held in @detail.
def detail; @detail; end

#escapeObject

Escape expression.



60
61
62
# File 'lib/regex/extractor.rb', line 60

# Escape expression.
# Returns the value currently held in @escape.
def escape; @escape; end

#formatObject

Output format.



66
67
68
# File 'lib/regex/extractor.rb', line 66

# Output format.
# Returns the value currently held in @format.
def format; @format; end

#indexObject

Index of expression return.



51
52
53
# File 'lib/regex/extractor.rb', line 51

# Index of expression return.
# Returns the value currently held in @index.
def index; @index; end

#insensitiveObject

Ignore case.



57
58
59
# File 'lib/regex/extractor.rb', line 57

# Ignore case.
# Returns the value currently held in @insensitive.
def insensitive; @insensitive; end

#ioObject

List of IO objects or Strings to search.



36
37
38
# File 'lib/regex/extractor.rb', line 36

# List of IO objects or Strings to search.
# Returns the value currently held in @io.
def io; @io; end

#multilineObject

Multiline match.



54
55
56
# File 'lib/regex/extractor.rb', line 54

# Multiline match.
# Returns the value currently held in @multiline.
def multiline; @multiline; end

#patternObject

Regular expression.



42
43
44
# File 'lib/regex/extractor.rb', line 42

# Regular expression.
# Returns the value currently held in @pattern.
def pattern; @pattern; end

#recursiveObject

Is a recursive search?



48
49
50
# File 'lib/regex/extractor.rb', line 48

# Is a recursive search?
# Returns the value currently held in @recursive.
def recursive; @recursive; end

#repeatObject

Repeat Match (global).



63
64
65
# File 'lib/regex/extractor.rb', line 63

# Repeat Match (global).
# Returns the value currently held in @repeat.
def repeat; @repeat; end

#templateObject

Select built-in regular expression by name.



45
46
47
# File 'lib/regex/extractor.rb', line 45

# Select built-in regular expression by name.
# Returns the value currently held in @template.
def template; @template; end

#unxmlObject

Remove XML tags from search. (NOT CURRENTLY SUPPORTED)



39
40
41
# File 'lib/regex/extractor.rb', line 39

# Remove XML tags from search. (NOT CURRENTLY SUPPORTED)
# Returns the value currently held in @unxml.
def unxml; @unxml; end

Class Method Details

.cli(argv = ARGV) ⇒ Object

Commandline Interface to Extractor.



336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
# File 'lib/regex/extractor.rb', line 336

# Commandline Interface to Extractor.
#
# argv - argument list to parse (defaults to ARGV); recognised options are
#        removed from it in place by OptionParser#parse!.
#
# After option parsing, if neither --search nor --template was given, the
# first remaining argument is taken as the expression: `/PATTERN/FLAGS`
# (flags `e`, `g`, `i`) is a pattern, anything else is a template name.
# The remaining arguments are files or directories; directories are only
# searched when --recursive is given. With no files, ARGF is searched.
#
# Prints the result of Extractor#to_s for the chosen format. Exits with
# status 1 if an argument is neither a file nor a directory.
def self.cli(argv=ARGV)
  require 'optparse'
  format  = nil
  options = {}
  parser = OptionParser.new do |opt|
    opt.on('--template', '-t NAME', "select a built-in regular expression") do |name|
      options[:template] = name
    end
    opt.on('--search', '-s PATTERN', "search for regular expression") do |re|
      options[:pattern] = re
    end
    opt.on('--recursive', '-R', 'search recursively though subdirectories') do
      options[:recursive] = true
    end
    opt.on('--escape', '-e', 'make all patterns verbatim string matchers') do
      options[:escape] = true
    end
    opt.on('--index', '-n INT', "return a specific match index") do |int|
      options[:index] = int.to_i
    end
    opt.on('--insensitive', '-i', "case insensitive matching") do
      options[:insensitive] = true
    end
    opt.on('--multiline', '-m', "multiline matching") do
      options[:multiline] = true
    end
    #opt.on('--unxml', '-x', "ignore XML/HTML tags") do
    #  options[:unxml] = true
    #end
    opt.on('--global', '-g', "find all matching occurances") do
      options[:repeat] = true
    end
    opt.on('--yaml', '-y', "output in YAML format") do
      format = :yaml
    end
    opt.on('--json', '-j', "output in JSON format") do
      format = :json
    end
    opt.on('--detail', '-d', "provide match details") do
      # NOTE(review): any truthy value enables detail mode; :json is kept as-is.
      options[:detail] = :json
    end
    opt.on('--[no-]ansi', "toggle ansi color") do |val|
      options[:ansi] = val
    end
    opt.on_tail('--debug', 'run in debug mode') do
      $DEBUG = true
    end
    opt.on_tail('--help', '-h', "display this lovely help message") do
      puts opt
      exit 0
    end
  end
  parser.parse!(argv)

  unless options[:pattern] or options[:template]
    re = argv.shift
    case re
    when /^\/(.*?)\/(\w*?)$/
      options[:pattern] = $1
      $2.split(//).each do |c|
        case c
        when 'e' then options[:escape] = true
        when 'g' then options[:repeat] = true
        when 'i' then options[:insensitive] = true
        end
      end
    else
      options[:template] = re
    end
  end

  files = []
  argv.each do |file|
    if File.directory?(file)
      if options[:recursive]
        # A bare '**' component behaves like '*' and only lists the first
        # level; '**/*' is required to descend into subdirectories.
        rec_files = Dir[File.join(file, '**', '*')].reject{ |d| File.directory?(d) }
        files.concat(rec_files)
      end
    elsif File.file?(file)
      files << file
    else
      $stderr.puts "Not a file -- '#{file}'."
      exit 1
    end
  end

  if files.empty?
    args = [ARGF]
  else
    # File.open rather than Kernel#open: the latter treats a leading '|'
    # in the name as a command to run.
    args = files.map{ |f| File.open(f) }
  end

  args << options

  extract = new(*args)

  puts extract.to_s(format)
end

.input_cache(input) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
# File 'lib/regex/extractor.rb', line 23

# Memoized text for an input, keyed by the input object itself.
#
# input - a String (used as-is) or any object responding to #read.
#
# Returns the cached text; a non-String input is only read the first time
# it is seen.
def self.input_cache(input)
  cache  = (@input_cache ||= {})
  cached = cache[input]
  return cached if cached

  cache[input] = input.is_a?(String) ? input : input.read
end

Instance Method Details

#ansi?Boolean

Use ANSI codes in output?

Returns:

  • (Boolean)


75
# File 'lib/regex/extractor.rb', line 75

# Use ANSI codes in output?
# Returns the value currently held in @ansi.
def ansi?
  @ansi
end

#bold(str) ⇒ Object



436
437
438
439
440
441
442
# File 'lib/regex/extractor.rb', line 436

# Wrap a string in ANSI bold escape codes when ANSI output is enabled.
#
# str - the text to emphasise.
#
# Returns the wrapped string, or `str` unchanged when `ansi?` is false.
# (The original returned an undefined local `string` in that case, which
# raised NameError.)
def bold(str)
  return str unless ansi?

  "\e[1m" + str + "\e[0m"
end

#deliminator_groupObject



327
328
329
# File 'lib/regex/extractor.rb', line 327

# Separator used between the groups of one match.
# Returns the DELIMINATOR_GROUP constant.
def deliminator_group; DELIMINATOR_GROUP; end

#deliminator_recordObject



331
332
333
# File 'lib/regex/extractor.rb', line 331

# Separator used between matches in repeat mode.
# Returns the DELIMINATOR_RECORD constant.
def deliminator_record; DELIMINATOR_RECORD; end

#formatted_match(input, match) ⇒ Object



224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/regex/extractor.rb', line 224

# Render one match as text for detailed output.
#
# input - the source the match came from (unused here; kept for callers).
# match - an object responding to `info(i)` (returning part, char, line)
#         and `size`.
#
# When `index` is set, returns a single "line char part" entry for that
# group. Otherwise returns the whole match (group 0) in bold, followed by
# one numbered line per remaining group, joined with newlines.
#
# Compared with the original this drops an unused `path` local, no longer
# queries group 0 when only `index` is rendered, and removes a guard that
# an empty range already covers.
def formatted_match(input, match)
  describe = lambda do |i|
    part, char, line = match.info(i)
    "%s %s %s" % [line, char, part.inspect]
  end

  return describe.call(index) if index

  lines = [bold(describe.call(0))]
  (1...match.size).each do |i|
    lines << "#{i}. " + describe.call(i)
  end
  lines.join("\n")
end

#inspectObject



90
91
92
# File 'lib/regex/extractor.rb', line 90

# Short inspection string: just the name of the receiver's class.
def inspect
  klass = self.class
  "#{klass.name}"
end

#line_at(io, char) ⇒ Object

Return the line number of the char position within text.



323
324
325
# File 'lib/regex/extractor.rb', line 323

# Return the line number of the char position within text.
#
# io   - the input whose text is obtained through `read`.
# char - zero-based character offset into that text.
#
# Returns a 1-based line number: one plus the number of newlines that come
# strictly before `char`. The original used an inclusive range, so an
# offset pointing at a newline itself was reported as being on the next
# line.
def line_at(io, char)
  read(io)[0...char].count("\n") + 1
end

#mappingObject



304
305
306
307
308
309
310
# File 'lib/regex/extractor.rb', line 304

# Group every match from `scan` by the input it was found in.
#
# Returns a Hash of input => Array of matches. The Hash creates (and
# stores) an empty Array for any key that is looked up but missing.
def mapping
  scan.each_with_object(Hash.new { |store, key| store[key] = [] }) do |found, grouped|
    grouped[found.input] << found
  end
end

#matches_by_pathObject



244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/regex/extractor.rb', line 244

def matches_by_path
  r = Hash.new{ |h,k| h[k] = [] }
  h = Hash.new{ |h,k| h[k] = [] }
  scan.each do |match|
    h[match.input] << match
  end
  h.each do |input, matches|
    path = (File === input ? input.path : "(io #{input.object_id})")
    if index
      matches.each do |match|
        r[path] << match.breakdown[index]
      end
    else
      matches.each do |match|
        r[path] << match.breakdown
      end
    end
  end
  r
end

#output_detailed_textObject

Detailed text output.



195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# File 'lib/regex/extractor.rb', line 195

def output_detailed_text
  if repeat
    count  = 0
    string = []
    mapping.each do |input, matches|
      path = (File === input ? input.path : "(io #{input.object_id})")
      string << ""
      string << bold(path)
      matches.each do |match|
        string << formatted_match(input, match)
        count += 1
      end
    end
    string.join("\n") + "\n"
    string << "\n(#{count} matches)"
  else
    string = []
    match  = scan.first
    input  = match.input
    path   = (File === input ? input.path : "(io #{input.object_id})")
    string << ""
    string << bold(path)
    string << formatted_match(input, match)
    string.join("\n")
    string << "" #"\n1 match"
  end
end

#output_textObject



183
184
185
186
187
188
189
190
191
192
# File 'lib/regex/extractor.rb', line 183

# Plain text output.
#
# Joins the groups of a match with the group deliminator; in repeat mode
# each match is joined that way and the matches are then joined with the
# record deliminator.
def output_text
  data = structure
  return data.join(deliminator_group) unless repeat

  records = data.map { |groups| groups.join(deliminator_group) }
  records.join(deliminator_record)
end

#read(input) ⇒ Object

TODO: unxml won’t give correct char counts.



313
314
315
316
317
318
319
320
# File 'lib/regex/extractor.rb', line 313

# Fetch the text of an input through the class-level input cache.
#
# TODO: unxml won't give correct char counts.
def read(input)
  text = Extractor.input_cache(input)
  # Disabled unxml handling, kept for reference:
  #  if unxml
  #    txt.gsub(/\<(.*?)\>/, '')
  #  else
  #    txt
  #  end
  text
end

#regexObject



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/regex/extractor.rb', line 108

# Build (and memoize in @regex) the regular expression to search with.
#
# - If `template` is set, the constant of that name (upcased) is looked up
#   in Templates.
# - A Regexp `pattern` is used as-is.
# - A String `pattern` is compiled with MULTILINE / IGNORECASE according
#   to `multiline` / `insensitive`; it is escaped verbatim when `escape`
#   is set, otherwise [:name:] templates are substituted first.
#
# Returns nil when `pattern` is neither a Regexp nor a String.
#
# The original wrote `flags + Regexp::…`, discarding the result, so the
# flags were never applied; they are now OR-ed into `flags`.
def regex
  @regex ||= (
    if template
      Templates.const_get(template.upcase)
    else
      case pattern
      when Regexp
        pattern
      when String
        flags = 0
        flags |= Regexp::MULTILINE  if multiline
        flags |= Regexp::IGNORECASE if insensitive
        source = escape ? Regexp.escape(pattern) : substitute_templates(pattern)
        Regexp.new(source, flags)
      end
    end
  )
end

#scanObject

Scan inputs for matches.

Returns an Array of Match objects, one per match, each built from [input, matchdata].



287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
# File 'lib/regex/extractor.rb', line 287

# Scan inputs for matches.
#
# Every input in `io` is read and scanned with `regex`; each hit becomes a
# Match built from the input and the current match data. An error raised
# while handling one input skips that input (a warning is printed only
# when $VERBOSE is set).
#
# Returns an Array of Match objects.
def scan
  io.each_with_object([]) do |source, found|
    # TODO: limit to text files, how?
    begin
      read(source).scan(regex) { found << Match.new(source, $~) }
    rescue => error
      warn("#{source.inspect} #{error}") if $VERBOSE
    end
  end
end

#structureObject

Structure the matchdata according to specified options.



266
267
268
# File 'lib/regex/extractor.rb', line 266

# Structure the matchdata according to specified options:
# the repeat form when `repeat` is set, the single form otherwise.
def structure
  if repeat
    structure_repeat
  else
    structure_single
  end
end

#structure_repeatObject

Structure the matchdata for repeat matches.



276
277
278
279
280
281
282
# File 'lib/regex/extractor.rb', line 276

# Structure the matchdata for repeat matches.
#
# With `index` set, each match contributes a one-element Array holding
# that group. Otherwise each match contributes its capture groups
# (everything after group 0), or just group 0 when it has no captures.
def structure_repeat
  return scan.map { |found| [found[index]] } if index

  scan.map do |found|
    if found.size > 1
      found[1..-1]
    else
      [found[0]]
    end
  end
end

#structure_singleObject

Structure the matchdata for single match.



271
272
273
# File 'lib/regex/extractor.rb', line 271

# Structure the matchdata for single match: the first entry of the
# repeat structure, or an empty Array when there is none.
def structure_single
  first_entry = structure_repeat.first
  first_entry ? first_entry : []
end

#substitute_templates(pattern) ⇒ Object



132
133
134
135
136
137
138
139
140
# File 'lib/regex/extractor.rb', line 132

# Expand [:name:] tokens in a pattern with the built-in templates.
#
# pattern - the pattern source String.
#
# For every name in Templates.list, each "[:name:]" token not preceded by
# a backslash is replaced with Templates[name].to_s.
#
# Returns the expanded String (the original object when nothing matched).
#
# Fixes over the original:
# - `(?!:\\)` was a lookahead that could never fail; the apparent intent,
#   skipping tokens escaped as `\[:name:]`, is now a real lookbehind.
# - The replacement is given as a block, so backslash sequences in the
#   template source (e.g. `\\` or `\1`) are inserted verbatim instead of
#   being interpreted by gsub.
# - The token is regex-escaped before being matched.
def substitute_templates(pattern)
  Templates.list.inject(pattern) do |pat, name|
    token = "[:#{name}:]"
    next pat unless pat.include?(token)

    pat.gsub(/(?<!\\)#{Regexp.escape(token)}/) { Templates[name].to_s }
  end
end

#to_s(format = nil) ⇒ Object



143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/regex/extractor.rb', line 143

# Render the extraction result.
#
# format - :yaml or :json select those serializers; anything else gives
#          text output (detailed when `detail` is set).
def to_s(format=nil)
  return to_s_yaml if :yaml == format
  return to_s_json if :json == format

  detail ? output_detailed_text : output_text
end

#to_s_jsonObject



169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/regex/extractor.rb', line 169

# JSON output.
#
# Loads 'json' (falling back to 'json_pure' if that fails to load), then
# serializes `matches_by_path` when `detail` is set, else `structure`.
def to_s_json
  begin
    require 'json'
  rescue LoadError
    require 'json_pure'
  end

  payload = detail ? matches_by_path : structure
  payload.to_json
end

#to_s_yamlObject



159
160
161
162
163
164
165
166
# File 'lib/regex/extractor.rb', line 159

# YAML output.
#
# Loads 'yaml', then serializes `matches_by_path` when `detail` is set,
# else `structure`.
def to_s_yaml
  require 'yaml'

  payload = detail ? matches_by_path : structure
  payload.to_yaml
end