Class: SimpleXmlParser::Parser

Inherits:
Nokogiri::XML::SAX::Document
  • Object
show all
Defined in:
lib/simple_xml_parser/parser.rb

Overview

To filter records, pass a lambda that takes a record (a Hash keyed by output field name) and returns a truthy value to include the record or a falsy value to exclude it. For example, to include only records that have a “title”: processor.record_filter = ->(rec) { rec["title"] } If a field name has been changed via the field_name_renames hash, use the new name in the filter.

Direct Known Subclasses

FideXmlParser

Constant Summary collapse

ANSI_GO_TO_LINE_START =
"\e[1G"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(array_name:, record_name:, integer_fields: nil, key_filter: nil, record_filter: nil, field_name_renames: nil) ⇒ Parser

Returns a new instance of Parser.



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/simple_xml_parser/parser.rb', line 41

# Sets up a parser with the caller's configuration and empty parse state.
#
# @param array_name [Object] element name compared against in start_element/end_element
#   to recognise the element that encloses all records
# @param record_name [Object] element name that marks the start and end of one record
# @param integer_fields [#include?, nil] field names whose values are converted with
#   Integer(); nil disables conversion
# @param key_filter [#call, nil] called with a field name; nil keeps every field
# @param record_filter [#call, nil] called with a finished record; nil keeps every record
# @param field_name_renames [#[], nil] maps input field names to output names; nil
#   disables renaming
def initialize(
  array_name:,
  record_name:,
  integer_fields: nil,
  key_filter: nil,
  record_filter: nil,
  field_name_renames: nil
)
  # Caller-supplied configuration.
  @array_name, @record_name = array_name, record_name
  @integer_fields = integer_fields
  @key_filter, @record_filter = key_filter, record_filter
  @field_name_renames = field_name_renames

  # Parse state, reset for every new parser instance.
  @current_property_name = nil
  @record, @records = {}, []
  @start_time = current_time
  @keys_to_exclude = []
  @input_record_count, @output_record_count = 0, 0
end

Instance Attribute Details

#array_nameObject (readonly)

Constructor parameters:



31
32
33
# File 'lib/simple_xml_parser/parser.rb', line 31

# Returns the `array_name:` value given to the constructor.
# start_element ignores an element with this name; end_element calls `finish`
# when an element with this name closes.
def array_name
  @array_name
end

#current_property_nameObject

For internal use:



37
38
39
# File 'lib/simple_xml_parser/parser.rb', line 37

# Returns the name of the field element currently open, or nil.
# start_element sets it for any element that is neither the array nor the record
# element; end_element resets it to nil when such an element closes.
def current_property_name
  @current_property_name
end

#field_name_renamesObject

User-provided callbacks:



34
35
36
# File 'lib/simple_xml_parser/parser.rb', line 34

# Returns the `field_name_renames:` value given to the constructor (nil by default).
# output_field_name looks up input field names in it with `[]`; nil means no renaming.
def field_name_renames
  @field_name_renames
end

#input_record_countObject

For internal use:



37
38
39
# File 'lib/simple_xml_parser/parser.rb', line 37

# Returns the number of record elements opened so far.
# Starts at 0 and is incremented by start_element for each record element.
def input_record_count
  @input_record_count
end

#integer_fieldsObject (readonly)

Constructor parameters:



31
32
33
# File 'lib/simple_xml_parser/parser.rb', line 31

# Returns the `integer_fields:` value given to the constructor (nil by default).
# maybe_convert_to_integer calls `include?` on it with the input field name;
# nil means no field is converted.
def integer_fields
  @integer_fields
end

#key_filterObject

User-provided callbacks:



34
35
36
# File 'lib/simple_xml_parser/parser.rb', line 34

# Returns the `key_filter:` callable given to the constructor (nil by default).
# include_this_field? calls it with a field name; nil means every field is kept.
def key_filter
  @key_filter
end

#output_record_countObject

For internal use:



37
38
39
# File 'lib/simple_xml_parser/parser.rb', line 37

# Returns the number of records kept so far.
# Starts at 0 and is incremented by end_element each time a record passes the
# record filter (or no filter is set).
def output_record_count
  @output_record_count
end

#recordObject

For internal use:



37
38
39
# File 'lib/simple_xml_parser/parser.rb', line 37

# Returns the Hash for the record currently being built.
# `characters` stores field values into it; end_element replaces it with a fresh
# empty Hash when a record element closes.
def record
  @record
end

#record_filterObject

User-provided callbacks:



34
35
36
# File 'lib/simple_xml_parser/parser.rb', line 34

# Returns the `record_filter:` callable given to the constructor (nil by default).
# end_element calls it with the finished record; nil means every record is kept.
def record_filter
  @record_filter
end

#record_nameObject (readonly)

Constructor parameters:



31
32
33
# File 'lib/simple_xml_parser/parser.rb', line 31

# Returns the `record_name:` value given to the constructor.
# start_element and end_element compare element names against it to detect the
# start and end of a record.
def record_name
  @record_name
end

#recordsObject

For internal use:



37
38
39
# File 'lib/simple_xml_parser/parser.rb', line 37

# Returns the Array of records kept so far.
# end_element appends to it; `parse` returns it once parsing is done.
def records
  @records
end

#start_timeObject (readonly)

Returns the value of attribute start_time.



28
29
30
# File 'lib/simple_xml_parser/parser.rb', line 28

# Returns the `current_time` reading captured when the parser was constructed.
# output_status subtracts it from the current reading to report elapsed seconds.
def start_time
  @start_time
end

Instance Method Details

#characters(string) ⇒ Object



128
129
130
131
132
133
134
# File 'lib/simple_xml_parser/parser.rb', line 128

# SAX callback: receives text found inside the element currently open.
#
# Text is stored only while a field element is open (current_property_name is set)
# and that field passes the key filter. The value is stored under the output
# (possibly renamed) field name, after optional integer conversion.
#
# A SAX parser may deliver one contiguous run of text in several calls (for example
# around entity references). A chunk that arrives for a key which already has a
# value in the current record is therefore appended to that value instead of
# replacing it, and integer conversion is re-applied to the accumulated text.
# NOTE(review): as a consequence, a field element repeated within one record is
# concatenated rather than last-one-wins.
#
# @param string [String] the text chunk reported by the parser
# @return [Object, nil] the value stored, or nil when the chunk is ignored
def characters(string)
  field_name = current_property_name
  return unless field_name && include_this_field?(field_name)

  key = output_field_name(field_name)
  text = record.key?(key) ? "#{record[key]}#{string}" : string
  record[key] = maybe_convert_to_integer(field_name, text)
end

#current_timeObject



67
68
69
# File 'lib/simple_xml_parser/parser.rb', line 67

# Reads the monotonic clock, which is suited to measuring elapsed time.
#
# @return [Float] clock reading in seconds
def current_time
  Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_second)
end

#end_element(name) ⇒ Object



95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/simple_xml_parser/parser.rb', line 95

# SAX callback: invoked when an element closes.
#
# * closing the array element ends the data, so `finish` is called;
# * closing a record element keeps the record if it passes the record filter (or no
#   filter is set), then starts a fresh empty record;
# * closing any other element ends the current field.
#
# @param name [String] name of the element that closed
def end_element(name)
  case name
  when array_name then finish
  when record_name
    accepted = record_filter.nil? || record_filter.call(record)
    if accepted
      records << record
      self.output_record_count += 1
    end
    self.record = {}
  else
    self.current_property_name = nil
  end
end

#finishObject



137
138
139
140
# File 'lib/simple_xml_parser/parser.rb', line 137

# Called when the array element closes: prints the final status line and then a
# newline to end that line.
def finish
  output_status
  $stdout.puts
end

#include_this_field?(field_name) ⇒ Boolean

Returns:

  • (Boolean)


123
124
125
# File 'lib/simple_xml_parser/parser.rb', line 123

# Decides whether a field should be stored in the record.
#
# @param field_name [String] input (un-renamed) field name
# @return [Object] true when no key filter is set, otherwise whatever the key
#   filter returns for this field name
def include_this_field?(field_name)
  return true if key_filter.nil?

  key_filter.call(field_name)
end

#maybe_convert_to_integer(field_name, value) ⇒ Object



117
118
119
120
# File 'lib/simple_xml_parser/parser.rb', line 117

# Converts a field value to an Integer when the field is listed in integer_fields;
# otherwise returns the value untouched.
#
# String values are parsed strictly as base 10. Without an explicit base,
# Integer() honours radix prefixes, so zero-padded data such as "010" would be
# read as octal 8 and "08" would raise.
#
# @param field_name [String] input (un-renamed) field name
# @param value [Object] the raw value, normally the text reported by the parser
# @return [Object] the converted Integer, or `value` unchanged
# @raise [ArgumentError] if a String value is not a valid decimal integer
def maybe_convert_to_integer(field_name, value)
  return value unless integer_fields&.include?(field_name)

  # Integer() accepts a base only for String input, so other values keep the
  # plain conversion.
  value.is_a?(String) ? Integer(value, 10) : Integer(value)
end

#output_field_name(input_field_name) ⇒ Object



111
112
113
114
# File 'lib/simple_xml_parser/parser.rb', line 111

# Maps an input field name to the name used as the key in the output record.
#
# @param input_field_name [String] field name as it appears in the XML
# @return [Object] the entry from field_name_renames when one exists, otherwise
#   the input name itself
def output_field_name(input_field_name)
  renames = field_name_renames
  if renames.nil?
    input_field_name
  else
    renames[input_field_name] || input_field_name
  end
end

#output_statusObject



72
73
74
75
76
77
78
79
# File 'lib/simple_xml_parser/parser.rb', line 72

# Rewrites the progress line in place: moves the cursor to the start of the line,
# then prints the records read, the records kept, and the seconds elapsed since
# the parser was constructed. No trailing newline is printed.
def output_status
  print ANSI_GO_TO_LINE_START
  status_line = format(
    "Records processed: %9d   kept: %9d    Seconds elapsed: %11.2f",
    input_record_count,
    output_record_count,
    current_time - start_time
  )
  print status_line
end

#parse(data_source) ⇒ Object



59
60
61
62
63
64
# File 'lib/simple_xml_parser/parser.rb', line 59

# Parses XML from the given source with a Nokogiri SAX parser that uses this
# object as its document handler, and returns the records kept.
#
# A String argument is treated as a file path. The file is opened for the
# duration of the parse and closed afterwards, even if parsing raises. Any other
# argument is handed to the SAX parser unchanged and is not closed here.
#
# @param data_source [String, Object] a file path, or a source the SAX parser accepts
# @return [Array] the records kept
def parse(data_source)
  parser = Nokogiri::XML::SAX::Parser.new(self)
  if data_source.is_a?(String)
    # Block form guarantees the handle is released; File.new left it open.
    File.open(data_source) { |file| parser.parse(file) }
  else
    parser.parse(data_source)
  end
  records
end

#start_element(name, _attrs) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
# File 'lib/simple_xml_parser/parser.rb', line 82

# SAX callback: invoked when an element opens.
#
# * the array element is ignored;
# * a record element bumps the input record count and refreshes the status line
#   on every 1000th record;
# * any other element is treated as a field of the current record and becomes the
#   current property.
#
# @param name [String] name of the element that opened
# @param _attrs [Object] element attributes; unused
def start_element(name, _attrs)
  case name
  when array_name
    nil # nothing to do for the enclosing element
  when record_name
    count = (self.input_record_count += 1)
    output_status if (count % 1000).zero?
  else
    self.current_property_name = name
  end
end