Class: SportDb::Parser
- Inherits:
-
Object
- Object
- SportDb::Parser
- Defined in:
- lib/sportdb/parser/lang.rb,
lib/sportdb/parser/opts.rb,
lib/sportdb/parser/token.rb,
lib/sportdb/parser/linter.rb,
lib/sportdb/parser/parser.rb,
lib/sportdb/parser/token-date.rb,
lib/sportdb/parser/token-text.rb,
lib/sportdb/parser/token-score.rb
Defined Under Namespace
Constant Summary collapse
- GROUP_RE =
Group A-Z, Group 1-99, Group HEX (used in CONCACAF World Cup qualifying), Group 1A or A1, B1 - used anywhere
use "key" of group - why? why not?
%r{^ Group [ ] (?<key>[a-z0-9]+) $}ix
- ROUND_RE =
%r{^( ## add special case for group play-off rounds! ## group 2 play-off (e.g. worldcup 1954, 1958) ## ### note - allow Group ("stand-alone") as "generic" round for now ## BUT do NOT allow Group 1, Group 2, Group A, Group B, etc. (?: Group [ ] [A-Z0-9]+ [ ] Play-?offs? | Group (?: [ ] phase)? | League (?: [ ] phase)? ) | # round - note - requiers number e.g. round 1,2, etc. # note - use 1-9 regex (cannot start with 0) - why? why not? # make week 01 or round 01 or matchday 01 possible? (?: (?: Round | Matchday | Week ) [ ] [1-9][0-9]* ) | ## starting with qual(ification) ## Qual. Round 1 / Qual. Round 2 / Qual. Round 3 ## or ## Playoff Round 1 ## Play-in Round 1 (?: (?: Qual \. | Play-?off | Play-?in ) [ ] Round [ ] [1-9][0-9]* ) | ## 1. Round / 2. Round / 3. Round / etc. ## First Round ## Play-off Round ## Final Round (e.g. Worldcup 1950) (?: (?: [1-9][0-9]* \. | 1st | First | 2nd | Second | Play-?off | Final ) [ ] Round ) | ## starting with preliminary # e.g. Preliminary round (?: Preliminary [ ] (?: Round | Semi-?finals | Final ) ) | # more (kockout) rounds # playoffs - playoff, play-off, play-offs (?: Play-?offs? (?: [ ]for[ ]quarter-?finals )? ) | # round32 (?: Round[ ]of[ ]32 | Last[ ]32 ) | # round16 (?: Round[ ]of[ ]16 | Last[ ]16 | 8th[ ]finals ) | # fifthplace (?: (?: (Fifth|5th)[ -]place (?: [ ] (?: match|play-?off|final ))? ) | (?: Match[ ]for[ ](?: fifth|5th )[ -]place ) ) | # thirdplace (?: (?: (Third|3rd)[ -]place (?: [ ] (?: match|play-?off|final ))? ) | (?: Match[ ]for[ ](?: third|3rd )[ -]place ) ) | # quarterfinals (?: Quarter-?finals? | Quarters | Last[ ]8 ) | # semifinals (?: Semi-?finals? | Semis | Last[ ]4 ) | # final Finals? | # decider e.g. Entscheidungsspiel Decider | ## add replays ## e.g. Final Replay ## Quarter-finals replays ## First round replays (?: (?: First [ ] Round | Quarter-?finals? | Finals? ) [ ] Replays? ) | ## more (?: Reclassification ) )$}ix
# Matches a leg marker such as "1st Leg", "First leg", "2nd leg" or "Second Leg"
# (case-insensitive, extended syntax).
#
# keep leg separate (from round) - why? why not?
#
# Note: both alternatives are wrapped in ONE group so that the start (^) and
# the end ($) anchor apply to each of them. Without the group the pattern
# reads as (^...1st leg) | (2nd leg...$) and accepts text with leading or
# trailing extras.
LEG_RE = %r{^(?:
               # leg1
               (?: 1st|First)[ ]leg
             |
               # leg2
               (?: 2nd|Second)[ ]leg
            )$}ix
- TIME_RE =
keep 18h30 - why? why not?
add support for 6:30pm 8:20am etc. - why? why not?
%r{ ## e.g. 18.30 (or 18:30 or 18h30) (?<time> \b (?<hour>\d{1,2}) (?: :|\.|h ) (?<minute>\d{2}) \b ) }ix
- TIMEZONE_RE =
for timezone format use for now: (BRT/UTC-3) (e.g. brazil time)
(CET/UTC+1) - central european time (CEST/UTC+2) - central european summer time - daylight saving time (DST). (EET/UTC+2) - eastern european time (EEST/UTC+3) - eastern european summer time - daylight saving time (DST).
UTC+3 UTC+4 UTC+0 UTC+00 UTC+0000
- allow +01 or +0100 - why? why not - +0130 (01:30)
see
https://en.wikipedia.org/wiki/Time_zone https://en.wikipedia.org/wiki/List_of_UTC_offsets https://en.wikipedia.org/wiki/UTC−04:00 etc.
%r{ ## e.g. (UTC-2) or (CEST/UTC-2) etc. (?<timezone> \( ## optional "local" timezone name eg. BRT or CEST etc. (?: [a-z]+ / )? [a-z]+ [+-] \d{1,4} ## e.g. 0 or 00 or 0000 \) ) }ix
- BASICS_RE =
%r{ ## e.g. (51) or (1) etc. - limit digits of number??? (?<num> \( (?<value>\d+) \) ) | (?<vs> (?<=[ ]) # Positive lookbehind for space (?: vs\.?| ## allow optional dot (eg. vs. v.) v\.?| - ) # not bigger match first e.g. vs than v etc. (?=[ ]) # positive lookahead for space ) | (?<none> (?<=[ \[]|^) # Positive lookbehind for space or [ - (?=[ ]*;) # positive lookahead for space ) | (?<spaces> [ ]{2,}) | (?<space> [ ]) | (?<sym>[;,@|\[\]]) }ix
- MINUTE_RE =
%r{ (?<minute> (?<=[ ]) # Positive lookbehind for space required (?<value>\d{1,3}) ## constrain numbers to 0 to 999!!! (?: \+ (?<value2>\d{1,3}) )? ' ## must have minute marker!!!! ) }ix
- GOAL_PEN_RE =
goal types (pen.) or (pen) or (p.) or (p) (o.g.) or (og)
%r{ (?<pen> \( (?:pen|p)\.? \) ) }ix
- GOAL_OG_RE =
%r{ (?<og> \( (?:og|o\.g\.) \) ) }ix
- RE =
Regexp.union( STATUS_RE, TIMEZONE_RE, TIME_RE, DURATION_RE, # note - duration MUST match before date DATE_RE, SCORE_RE, BASICS_RE, MINUTE_RE, GOAL_OG_RE, GOAL_PEN_RE, TEXT_RE )
- MONTH_LINES =
parse_names( <<TXT ) January Jan February Feb March Mar April Apr May June Jun July Jul August Aug September Sept Sep October Oct November Nov December Dec TXT
- MONTH_NAMES =
build_names( MONTH_LINES )
- MONTH_MAP =
pp MONTH_NAMES
build_map( MONTH_LINES, downcase: true )
- DAY_LINES =
parse_names( <<TXT ) Monday Mon Mo Tuesday Tues Tue Tu Wednesday Wed We Thursday Thurs Thur Thu Th Friday Fri Fr Saturday Sat Sa Sunday Sun Su TXT
- DAY_NAMES =
build_names( DAY_LINES )
- DAY_MAP =
pp DAY_NAMES
build_map( DAY_LINES, downcase: true )
- DATE_I_RE =
e.g. Fri Aug/9 or Fri Aug 9
%r{ (?<date> \b ## optional day name ((?<day_name>#{DAY_NAMES}) [ ] )? (?<month_name>#{MONTH_NAMES}) (?: \/|[ ] ) (?<day>\d{1,2}) ## optional year ( [ ] (?<year>\d{4}) )? \b )}ix
- DATE_II_RE =
e.g. 3 June or 10 June
%r{ (?<date> \b ## optional day name ((?<day_name>#{DAY_NAMES}) [ ] )? (?<day>\d{1,2}) [ ] (?<month_name>#{MONTH_NAMES}) ## optional year ( [ ] (?<year>\d{4}) )? \b )}ix
- DATE_RE =
map tables
note: order matters; first come-first matched/served
Regexp.union( DATE_I_RE, DATE_II_RE )
- DURATION_I_RE =
todo add plus later on - why? why not?
%r{ (?<duration> \b ## optional day name ((?<day_name1>#{DAY_NAMES}) [ ] )? (?<month_name1>#{MONTH_NAMES}) (?: \/|[ ] ) (?<day1>\d{1,2}) ## optional year ( [ ] (?<year1>\d{4}) )? ## support + and - (add .. or such - why??) [ ]*[-][ ]* ## optional day name ((?<day_name2>#{DAY_NAMES}) [ ] )? (?<month_name2>#{MONTH_NAMES}) (?: \/|[ ] ) (?<day2>\d{1,2}) ## optional year ( [ ] (?<year2>\d{4}) )? \b )}ix
- DURATION_II_RE =
variant ii e.g. 26 July - 27 July
%r{ (?<duration> \b ## optional day name ((?<day_name1>#{DAY_NAMES}) [ ] )? (?<day1>\d{1,2}) [ ] (?<month_name1>#{MONTH_NAMES}) ## optional year ( [ ] (?<year1>\d{4}) )? ## support + and - (add .. or such - why??) [ ]*[-][ ]* ## optional day name ((?<day_name2>#{DAY_NAMES}) [ ] )? (?<day2>\d{1,2}) [ ] (?<month_name2>#{MONTH_NAMES}) ## optional year ( [ ] (?<year2>\d{4}) )? \b )}ix
- DURATION_RE =
map tables
note: order matters; first come-first matched/served
Regexp.union( DURATION_I_RE, DURATION_II_RE )
- TEXT_RE =
%r{ ## must start with alpha (allow unicode letters!!) (?<text> ## positive lookbehind ## (MUST be fixed number of chars - no quantifier e.g. +? etc.) (?<=[ ,;@|\[\]] |^ ) (?: # opt 1 - start with alpha \p{L}+ ## all unicode letters (e.g. [a-z]) | # opt 2 - start with num!! - allow special case (e.g. 1. FC) \d+ # check for num lookahead (MUST be space or dot) ## MUST be followed by (optional dot) and ## required space !!! ## MUST be follow by a to z!!!! \.? ## optional dot [ ]? ## make space optional too - why? why not? ## yes - eg. 1st, 2nd, 5th etc. \p{L}+ | ## opt 3 - add weirdo case ## e.g. 5.-8. Platz Playoffs - keep - why? why not? \d+\.-\d+\. [ ]? \p{L}+ ) (?:(?: (?:[ ] (?!vs?\.?[ ]) ## note - exclude (v[ ]/vs[ ]/v.[ ]/vs.[ ]) ) | # only single spaces allowed inline!!! [-] )? (?: \p{L} | [&/'°] | (?: \d+ (?! [0-9h'+-] | ## protected break on 12h / 12' / 1-1 ## check usege for 3+4 - possible? where ? why? (?:[.:]\d) ## protected/exclude/break on 12.03 / 12:03 ) ## negative lookahead for numbers ## note - include digits itself!!! ## note - remove / (slash) e.g. allows UDI'19/Beter Bed )| \. ) )* ## must NOT end with space or dash(-) ## todo/fix - possible in regex here ## only end in alphanum a-z0-9 (not dot or & ???) ## allow optional at the end ## tag or year ## make it and in the future - why? why not? ## ## change - fix ## do NOT use (A) for amateur ## use A or A. with NO ()!!! ## (A) - allow with predined alpha only for now ## e.g. (A) - amateur a team or b? ### same for U21 or U9 etc ## use with NO ()!!! - why? why not? ## or U21 U9 etc. - why? why not? ## or etc. ## (1879-1893) or allow years e.g. (1879-1893) ### ## add allow country code three to five letters for now ## change to generic 1 to 5 - why? why not? ## e.g. (A), (I), ## (AUT) ## (TRNC) five? for UEFA code for northern cyprus ## change to 1 to 4 - why? why not? ## check - fix possible for upper case only here ## inline for this group only? (?: [ ] \( \d{4}-\d{4} \) )? 
(?: [ ]+ ## allow more than once space - why? why not? \( (?: [A-Z]{1,5} ) \) )? ## add lookahead/lookbehind ## must be space!!! ## (or comma or start/end of string) ## kind of \b !!! ## positive lookahead (?=[ ,;@|\[\]] |$ ) ) }ix
- P_EN =
english helpers (penalty, extra time, …)
note - p must go last (shortest match) pso = penalty shootout
'(?: pso | pen\.? | p\.? )'
- ET_EN =
extra time markers, e.g. aet, a.e.t, a.e.t. (the examples p., p, pen, pen., PSO, etc. describe the penalty marker P_EN)
'(?: aet | a\.e\.t\.? )'
- SCORE__P_ET__RE =
note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.)
3-4 pen. 2-2 a.e.t. 3-4 pen. 2-2 a.e.t. 2-2 a.e.t.
%r{ (?<score> \b (?: (?<p1>\d{1,2}) - (?<p2>\d{1,2}) [ ]* #{P_EN} [ ]+ )? # note: make penalty (P) score optional for now (?<et1>\d{1,2}) - (?<et2>\d{1,2}) [ ]* #{ET_EN} (?=[ \]]|$) )}ix
- SCORE__P__RE =
note: allow SPECIAL with penalty only
3-4 pen.
%r{ (?<score> \b (?<p1>\d{1,2}) - (?<p2>\d{1,2}) [ ]* #{P_EN} (?=[ \]]|$) )}ix
- SCORE__P_ET_FT_HT__RE =
e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) or
3-4p 2-2aet (1-1, ) or 3-4 pen. 2-2 a.e.t. (1-1) or 2-2 a.e.t. (1-1, 1-1) or 2-2 a.e.t. (1-1, ) or 2-2 a.e.t. (1-1)
%r{ (?<score> \b (?: (?<p1>\d{1,2}) - (?<p2>\d{1,2}) [ ]* #{P_EN} [ ]+ )? # note: make penalty (P) score optional for now (?<et1>\d{1,2}) - (?<et2>\d{1,2}) [ ]* #{ET_EN} [ ]+ \( [ ]* (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]* (?: , [ ]* (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* )? )? # note: make half time (HT) score optional for now \) (?=[ \]]|$) )}ix
- SCORE__P_FT_HT__RE =
special case for case WITHOUT extra time!!
same as above (but WITHOUT extra time and pen required)
%r{ (?<score> \b (?<p1>\d{1,2}) - (?<p2>\d{1,2}) [ ]* #{P_EN} [ ]+ \( [ ]* (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) [ ]* (?: , [ ]* (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* )? )? # note: make half time (HT) score optional for now \) (?=[ \]]|$) )}ix
- SCORE__FT_HT__RE =
e.g. 2-1 (1-1) or
2-1
%r{ (?<score> \b (?<ft1>\d{1,2}) - (?<ft2>\d{1,2}) (?: [ ]+ \( [ ]* (?<ht1>\d{1,2}) - (?<ht2>\d{1,2}) [ ]* \) )? # note: make half time (HT) score optional for now (?=[ \]]|$) )}ix
- SCORE_RE =
map tables
note: order matters; first come-first matched/served
Regexp.union( SCORE__P_ET_FT_HT__RE, # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0) SCORE__P_FT_HT__RE, # e.g. 5-1 pen. (1-1) SCORE__P_ET__RE, # e.g. 2-2 a.e.t. or 5-1 pen. 2-2 a.e.t. SCORE__P__RE, # e.g. 5-1 pen. SCORE__FT_HT__RE, # e.g. 1-1 (1-0) or 1-1 -- note - must go last!!! )
Class Method Summary collapse
- .build_map(lines, downcase: false) ⇒ Object
- .build_names(lines) ⇒ Object
- .more_round_names ⇒ Object
-
.parse_date(str, start:) ⇒ Object
add a date parser helper.
- .parse_names(txt) ⇒ Object
-
.read_names(path) ⇒ Object
add more round names in different languages via txt files.
Instance Method Summary collapse
- #is_group?(text) ⇒ Boolean
-
#is_leg?(text) ⇒ Boolean
Pair matches/games if marked with leg1 and leg2.
- #is_round?(text) ⇒ Boolean
- #log(msg) ⇒ Object
-
#parse(line, debug: false) ⇒ Object
convenience helper - ignore errors by default.
- #parse_with_errors(line, debug: false) ⇒ Object
-
#tokenize(line, typed: false, debug: false) ⇒ Object
convenience helper - ignore errors by default.
- #tokenize_with_errors(line, typed: false, debug: false) ⇒ Object
Class Method Details
.build_map(lines, downcase: false) ⇒ Object
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# File 'lib/sportdb/parser/token-date.rb', line 40
#
# Build a lookup map from every name (word) to the 1-based index of the
# line it appears on, e.g.
#   {"january"  => 1, "jan" => 1,
#    "february" => 2, "feb" => 2,
#    "march"    => 3, "mar" => 3,
#    "april"    => 4, "apr" => 4,
#    "may"      => 5,
#    "june"     => 6, "jun" => 6, ...
#
# @param lines    [Array<Array<String>>] one array of names per line
# @param downcase [Boolean] when true the names are downcased before being
#   used as keys
# @return [Hash{String=>Integer}] name => line no (starting with 1)
#
# Note: if a name is listed on more than one line the last line wins
#       (there is no check for duplicates).
def self.build_map( lines, downcase: false )
  lines.each_with_index.each_with_object( {} ) do |(names, idx), map|
    names.each do |name|
      key = downcase ? name.downcase : name
      map[ key ] = idx + 1   ## note: start mapping with 1 (and NOT zero-based, that is, 0)
    end
  end
end
.build_names(lines) ⇒ Object
33 34 35 36 37 |
# File 'lib/sportdb/parser/token-date.rb', line 33
#
# Join all words together into a single alternation string e.g.
#   January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
#
# @param lines [Array<Array<String>>] one array of names per line
# @return [String] all names joined by "|" (in the given order)
#
# Note: empty lines / empty names are dropped. Otherwise the result would
#       contain an empty alternative (e.g. "Jan||Feb") and an empty
#       alternative matches the empty string when used inside a regex.
def self.build_names( lines )
  lines.flatten.reject { |name| name.empty? }.join( '|' )
end
.more_round_names ⇒ Object
180 181 182 183 184 185 186 187 188 189 190 191 |
# File 'lib/sportdb/parser/lang.rb', line 180
#
# Collect the extra round names from the config/rounds_<lang>.txt files
# (for en, de, es, pt and misc - in this order) found under the parser
# root dir. The list is read once and memoized.
#
# @return [Array<String>] all names of all files (one entry per line)
def self.more_round_names
  @more_round_name ||= begin
                         ## sort names by length??
                         %w[en de es pt misc].flat_map { |lang|
                           read_names( "#{SportDb::Module::Parser.root}/config/rounds_#{lang}.txt" )
                         }
                       end
end
.parse_date(str, start:) ⇒ Object
add a date parser helper
160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
# File 'lib/sportdb/parser/token-date.rb', line 160
#
# Parse the first date found in str (using DATE_RE) into a Date.
#
# If the date has no year the year gets calculated from the start date:
# on or after start (by month/day) => same year as start,
# otherwise => year+1 (e.g. 2014 for a 2013/14 season).
#
# @param str   [String] text with a date e.g. "Fri Aug/9" or "3 June 2024"
# @param start [Date] (required) start date used to calculate a missing
#   year - must respond to year, month and day
# @return [Date]
#
# Note: if no date is found an error message gets printed and the
#       process is terminated via exit 1 (no exception is raised).
def self.parse_date( str, start: )
  m = DATE_RE.match( str )
  unless m
    puts "!! ERROR - unexpected date format; cannot parse >#{str}<"
    exit 1
  end

  year  = m[:year].to_i(10)                     if m[:year]
  month = MONTH_MAP[ m[:month_name].downcase ]  if m[:month_name]
  day   = m[:day].to_i(10)                      if m[:day]
  ## note - the (optional) day name is matched but not needed for building the date

  if year.nil?   ## try to calculate year
    year = if  month > start.month ||
              (month == start.month && day >= start.day)
             # assume same year as start_at event (e.g. 2013 for 2013/14 season)
             start.year
           else
             # assume year+1 as start_at event (e.g. 2014 for 2013/14 season)
             start.year+1
           end
  end

  Date.new( year, month, day )
end
.parse_names(txt) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/sportdb/parser/token-date.rb', line 6
#
# Parse a text block with one list of (space or tab separated) names per line.
#
# Blank lines and comment lines (starting with #) get skipped;
# inline (until end-of-line) comments get stripped too e.g.
#   Janvier Janv Jan  ## check janv in use??
#     => Janvier Janv Jan
#
# @param txt [String] text with names
# @return [Array<Array<String>>] array of lines (with words)
def self.parse_names( txt )
  ## todo/fix -- add check for duplicates
  txt.each_line
     .map    { |line| line.sub( /#.*/, '' ).strip }   ## cut-off comment (if any)
     .reject { |line| line.empty? }                   ## skip blank (and comment-only) lines
     .map    { |line| line.split( /[ \t]+/ ) }
end
.read_names(path) ⇒ Object
add more round names in different languages
via txt files
for now must match case - maybe make case-insensitive later - why? why not?
158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
# File 'lib/sportdb/parser/lang.rb', line 158
#
# Read a list of names (one name per line) from a text file.
#
# Blank lines and comment lines (starting with #) get skipped;
# inline (until end-of-line) comments get stripped too.
# Note: a line is kept as a whole (that is, NOT split into words).
#
# @param path [String] path of the text file (read via read_text)
# @return [Array<String>] array of names (lines)
def self.read_names( path )
  read_text( path ).each_line
                   .map    { |line| line.sub( /#.*/, '' ).strip }   ## cut-off comment (if any)
                   .reject { |line| line.empty? }                   ## skip blank (and comment-only) lines
end
Instance Method Details
#is_group?(text) ⇒ Boolean
20 21 22 23 |
# File 'lib/sportdb/parser/lang.rb', line 20
#
# Check if the text is a group heading, that is, matches GROUP_RE.
#
# @param text [String]
# @return [Boolean]
def is_group?( text )
  GROUP_RE.match?( text )
end
#is_leg?(text) ⇒ Boolean
Pair matches/games if marked with leg1 and leg2
211 212 213 |
# File 'lib/sportdb/parser/lang.rb', line 211
#
# Check if the text is a leg marker, that is, matches LEG_RE.
#
# @param text [String]
# @return [Boolean]
def is_leg?( text )
  LEG_RE.match?( text )
end
#is_round?(text) ⇒ Boolean
194 195 196 197 |
# File 'lib/sportdb/parser/lang.rb', line 194
#
# Check if the text is a round name, that is, matches ROUND_RE or
# is listed (exact match) in the more round names of the class.
#
# @param text [String]
# @return [Boolean]
def is_round?( text )
  return true  if ROUND_RE.match?( text )

  self.class.more_round_names.include?( text )
end
#log(msg) ⇒ Object
133 134 135 136 137 138 139 140 |
# File 'lib/sportdb/parser/token.rb', line 133
#
# Append the message plus a newline to ./logs.txt (opened in append mode
# with utf-8 encoding).
#
# @param msg [String] message to log
def log( msg )
  ## use ./errors.txt - why? why not?
  File.open( './logs.txt', 'a:utf-8' ) do |io|
    io.write( msg )
    io.write( "\n" )
  end
end
#parse(line, debug: false) ⇒ Object
convenience helper - ignore errors by default
206 207 208 209 |
# File 'lib/sportdb/parser/parser.rb', line 206
#
# Same as parse_with_errors but returns the nodes only
# (the list of errors is dropped).
#
# @param line  [String] line to parse
# @param debug [Boolean] passed along to parse_with_errors
# @return [Array] nodes
def parse( line, debug: false )
  parse_with_errors( line, debug: debug ).first
end
#parse_with_errors(line, debug: false) ⇒ Object
103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
# File 'lib/sportdb/parser/parser.rb', line 103
#
# Parse a line into (parse tree/ast) nodes.
#
# Steps:
#   1) tokenize the line (typed tokens)
#   2) pass 1 - replace text tokens with keyword tokens
#        (group, leg, round) where the text matches
#   3) transform the tokens into nodes e.g.
#        text score|vs text  =>  team score|vs team
#        text minute         =>  player minute
#        @ ...               =>  text changed to geo (for the rest of the line)
#
# @param line  [String] line to parse
# @param debug [Boolean] passed along to the tokenizer
# @return [Array(Array, Array<String>)] pair of nodes and (tokenizer) errors
def parse_with_errors( line, debug: false )
  errors = []

  ## note - pass along debug flag to tokenizer too
  tokens, token_errors = tokenize_with_errors( line, typed: true, debug: debug )
  errors += token_errors

  #############
  ## pass 1
  ##   replace all texts with keyword matches (e.g. group, round, leg, etc.)
  tokens = tokens.map do |t|
    if t[0] == :text
      text = t[1]
      if is_group?( text )
        [:group, text]
      elsif is_leg?( text )
        [:leg, text]
      elsif is_round?( text )
        [:round, text]
      else
        t  ## pass through as-is (1:1)
      end
    else
      t
    end
  end

  ## puts "tokens:"
  ## pp tokens

  ## transform tokens into (parse tree/ast) nodes
  nodes = []

  buf = Tokens.new( tokens )
  ## pp buf

  loop do
    break if buf.eos?

    ## simplify - remove separator for round + leg pair
    ##   e.g. Round of 16, 1st Leg
    ##  allow Round of 16 - 1st Leg too - why? why not?
    if buf.match?( :round, [:',', :'|', :'-', :vs,  ### fix - change parser to issue :'-' only for (-) not :vs!!!
                           ], :leg )
      nodes << [:round, buf.next[1]]
      buf.next   ## swallow separator
      nodes << [:leg, buf.next[1]]
      next
    end

    if buf.pos == 0
      ## MUST start line
      ## check for
      ##   group def or round def
      if buf.match?( :round, :'|', [:date, :duration] )
        ## assume round def (change round to round_def)
        nodes << [:round_def, buf.next[1]]
        buf.next   ## swallow pipe
        nodes += buf.collect
        break
      end

      if buf.match?( :group, :'|', :text )
        ## assume group def (change group to group_def)
        nodes << [:group_def, buf.next[1]]
        buf.next   ## swallow pipe
        ## change all text to team
        nodes += buf.collect { |t| t[0] == :text ? [:team, t[1]] : t }
        break
      end
    end

    if buf.match?( :text, :'-', :text )   ## hacky? convert "generic" :- to :vs
      ## keep this rule/option - why? why not?
      nodes << [:team, buf.next[1]]
      buf.next   ## swallow separator (-) - otherwise the separator gets
                 ##   used for the second team (and the text is left over)
      nodes << [:vs]
      nodes << [:team, buf.next[1]]
    elsif buf.match?( :text, [:score, :vs], :text )
      nodes << [:team, buf.next[1]]
      nodes << buf.next
      nodes << [:team, buf.next[1]]
    elsif buf.match?( :text, :minute )
      nodes << [:player, buf.next[1]]
      nodes << buf.next
    elsif buf.cur == :'@'
      ## add all to the end as is
      ##   only change text to geo
      nodes += buf.collect { |t| t[0] == :text ? [:geo, t[1]] : t }
      break
    else
      ## pass through
      nodes << buf.next
    end
  end

  [nodes,errors]
end
#tokenize(line, typed: false, debug: false) ⇒ Object
convenience helper - ignore errors by default
345 346 347 348 349 350 |
# File 'lib/sportdb/parser/token.rb', line 345
#
# Same as tokenize_with_errors but returns the tokens only
# (the list of errors is dropped).
#
# @param line  [String] line to tokenize
# @param typed [Boolean] passed along to tokenize_with_errors
# @param debug [Boolean] passed along to tokenize_with_errors
# @return [Array] tokens
def tokenize( line, typed: false, debug: false )
  tokenize_with_errors( line, typed: typed, debug: debug ).first
end
#tokenize_with_errors(line, typed: false, debug: false) ⇒ Object
144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 |
# File 'lib/sportdb/parser/token.rb', line 144
#
# Split a line into tokens by matching the (combined) RE regex again and
# again, starting at the end of the last match.
#
# Text that gets skipped between two matches (or after the last match) is
# reported as a parse error, that is, printed, appended to the log (see log)
# and added to the returned list of errors.
#
# Space(s) produce no token; of the symbols only , ; @ | produce a token
# (others e.g. brackets [] get ignored).
#
# @param line  [String] line to tokenize
# @param typed [Boolean] when true, tokens carry converted values
#   (e.g. integers for time, date, duration, num, score, minute) and
#   og / pen / vs / none drop the literal text
# @param debug [Boolean] when true, print debug output
# @return [Array(Array, Array<String>)] pair of tokens and errors
# @raise [ArgumentError] if typed and a time is out-of-range
def tokenize_with_errors( line, typed: false, debug: false )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?

  puts ">#{line}<"  if debug

  pos = 0
  ## track last offsets - to report error on no match
  ##   or no match in end of string
  offsets = [0,0]

  while (m = RE.match( line, pos ))
    if debug
      pp m
      puts "pos: #{pos}"
    end

    offsets = [m.begin(0), m.end(0)]

    unless offsets[0] == pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      skipped = line[pos..(offsets[0]-1)]
      msg = "!! WARN - parse error - skipping >#{skipped}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      puts msg

      errors << "parse error - skipping >#{skipped}< @#{offsets[0]},#{offsets[1]}"
      log( msg )
    end

    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end off string!!!!
    ##    report skipped text run too!!!

    pos = offsets[1]

    pp offsets   if debug

    token = if m[:space] || m[:spaces]
              nil   ## skip space(s)
            elsif m[:text]
              [:text, m[:text]]   ## keep pos - why? why not?
            elsif m[:status]   ## (match) status e.g. cancelled, awarded, etc.
              if m[:status_note]   ## includes note? e.g.  awarded; originally 2-0
                [:status, m[:status], {note:m[:status_note]}]
              else
                [:status, m[:status]]
              end
            elsif m[:time]
              if typed
                ## unify to iso-format
                ###   12.40 => 12:40
                ##    12h40 => 12:40 etc.
                ##  keep string (no time-only type in ruby)
                hour   = m[:hour].to_i(10)   ## allow 08/07/etc.
                minute = m[:minute].to_i(10)
                ## check if valid -  0:00 - 24:00
                ##   check if 24:00 possible? or only 0:00 (23:59)
                unless hour.between?( 0, 24 ) && minute.between?( 0, 59 )
                  raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
                end
                ## note - for debugging keep (pass along) "literal" time
                ##   might use/add support for am/pm later
                [:time, m[:time], {h:hour,m:minute}]
              else
                [:time, m[:time]]
              end
            elsif m[:date]
              if typed
                ## the date regex (variant i) for reference:
                ##   ((?<day_name>#{DAY_NAMES}) [ ] )?
                ##   (?<month_name>#{MONTH_NAMES}) (?: \/|[ ] ) (?<day>\d{1,2})
                ##   ## optional year
                ##   ( [ ] (?<year>\d{4}) )?

                ## map month names
                ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
                date = {}
                date[:y]    = m[:year].to_i(10)                     if m[:year]
                date[:m]    = MONTH_MAP[ m[:month_name].downcase ]  if m[:month_name]
                date[:d]    = m[:day].to_i(10)                      if m[:day]
                date[:wday] = DAY_MAP[ m[:day_name].downcase ]      if m[:day_name]
                ## note - for debugging keep (pass along) "literal" date
                [:date, m[:date], date]
              else
                [:date, m[:date]]
              end
            elsif m[:timezone]
              [:timezone, m[:timezone]]
            elsif m[:duration]
              if typed
                from = {}
                from[:y]    = m[:year1].to_i(10)                     if m[:year1]
                from[:m]    = MONTH_MAP[ m[:month_name1].downcase ]  if m[:month_name1]
                from[:d]    = m[:day1].to_i(10)                      if m[:day1]
                from[:wday] = DAY_MAP[ m[:day_name1].downcase ]      if m[:day_name1]

                to = {}
                to[:y]    = m[:year2].to_i(10)                     if m[:year2]
                to[:m]    = MONTH_MAP[ m[:month_name2].downcase ]  if m[:month_name2]
                to[:d]    = m[:day2].to_i(10)                      if m[:day2]
                to[:wday] = DAY_MAP[ m[:day_name2].downcase ]      if m[:day_name2]

                ## note - for debugging keep (pass along) "literal" duration
                [:duration, m[:duration], { start: from, end: to }]
              else
                [:duration, m[:duration]]
              end
            elsif m[:num]
              if typed
                ## note - strip enclosing () and convert to integer
                [:num, m[:value].to_i(10)]
              else
                [:num, m[:num]]
              end
            elsif m[:score]
              if typed
                score = {}
                ## check for pen / extra time / full time / half time (in this order)
                [[:p,  :p1,  :p2],
                 [:et, :et1, :et2],
                 [:ft, :ft1, :ft2],
                 [:ht, :ht1, :ht2]].each do |key, name1, name2|
                  score[key] = [m[name1].to_i(10), m[name2].to_i(10)]  if m[name1] && m[name2]
                end
                ## note - for debugging keep (pass along) "literal" score
                [:score, m[:score], score]
              else
                [:score, m[:score]]
              end
            elsif m[:minute]
              if typed
                minute = {}
                minute[:m]      = m[:value].to_i(10)
                minute[:offset] = m[:value2].to_i(10)   if m[:value2]
                ## note - for debugging keep (pass along) "literal" minute
                [:minute, m[:minute], minute]
              else
                [:minute, m[:minute]]
              end
            elsif m[:og]
              typed ? [:og] : [:og, m[:og]]   ## for typed drop - string version/variants
            elsif m[:pen]
              typed ? [:pen] : [:pen, m[:pen]]
            elsif m[:vs]
              typed ? [:vs] : [:vs, m[:vs]]
            elsif m[:none]
              typed ? [:none] : [:none, m[:none]]
            elsif m[:sym]
              sym = m[:sym]
              ## return symbols "inline" as is - why? why not?
              ##   ignore others (e.g. brackets [])
              [',', ';', '@', '|'].include?( sym ) ? [sym.to_sym] : nil
            else
              ## report error
              nil
            end

    tokens << token   if token

    if debug
      print ">"
      print "*" * pos
      puts "#{line[pos..-1]}<"
    end
  end

  ## check if no match in end of string
  if offsets[1] != line.size
    rest = line[offsets[1]..-1]
    msg = "!! WARN - parse error - skipping >#{rest}< @#{offsets[1]},#{line.size} in line >#{line}<"
    puts msg
    log( msg )

    errors << "parse error - skipping >#{rest}< @#{offsets[1]},#{line.size}"
  end

  [tokens,errors]
end