Class: Rsssf::Parser
- Inherits:
-
Object
- Object
- Rsssf::Parser
- Defined in:
- lib/rsssf/parser/token.rb,
lib/rsssf/parser/linter.rb,
lib/rsssf/parser/parser.rb,
lib/rsssf/parser/token-date.rb,
lib/rsssf/parser/token-note.rb,
lib/rsssf/parser/token-text.rb,
lib/rsssf/parser/token-goals.rb,
lib/rsssf/parser/token-round.rb,
lib/rsssf/parser/token-score.rb
Defined Under Namespace
Classes: Linter
Constant Summary collapse
- BASICS_RE =
%r{ (?<spaces> [ ]{2,}) | (?<space> [ ]) | (?<sym>[;,@|\[\]\(\)]) ## note - add () too - why? why not? }ix
- VS_RE =
%r{ ## must be space before and after!!! (?<vs> (?<=[ ]) # Positive lookbehind for space - (?=[ ]) # positive lookahead for space ) }ix
- RE =
Regexp.union( GROUP_RE, ROUND_RE, LEG_RE, DATE_RE, VS_RE, SCORE_RE, SCORE_AWD_RE, SCORE_ABD_RE, SCORE_PPD_RE, SCORE_NP_RE, SCORE_WO_RE, SCORE_EXT_RE, NOTE_RE, BASICS_RE, TEXT_RE )
- HYPHEN_RE =
rename to dash or to ???
used to add/allow hyphen/dash (-) in INSIDE_RE
%r{ ## must be space before and after (or end of line)!!! ## note - uses SYM capture (?<sym> (?<=[ ]) # Positive lookbehind for space - (?=[ ]|$) # positive lookahead for space ) }ix
- SCORE_AT_RE =
rename to ?? use SCORE_AT for now - why? why not?
add support for score at/score points/markers e.g. [1-0 Andrei 08, 1-1 Rydlewicz 24, 1-2 Prica 85, 2-2 Bella 88, 2-3 Arvidsson 102]
%r{ (?<score_at> \b \d{1,2}-\d{1,2} \b ) }ix
- INSIDE_RE =
“strict” text match mode inside brackets
]
Regexp.union( SCORE_AT_RE, GOAL_OG_RE, GOAL_PEN_RE, BASICS_RE, HYPHEN_RE, TEXT_STRICT_RE, MINUTE_RE, )
- SYM_CLOSE =
open/close pairs - lookup close (by open char)
{ '(' => ')', '[' => ']', }
- MONTH_LINES =
note - support only 5 letter max for now
no January|February|August etc.
SportDb::Parser.parse_names( <<TXT ) Jan Feb March Mar April Apr May June Jun July Jul Aug Sept Sep Oct Nov Dec TXT
- MONTH_NAMES =
SportDb::Parser.build_names( MONTH_LINES )
- MONTH_MAP =
pp MONTH_NAMES
SportDb::Parser.build_map( MONTH_LINES, downcase: true )
- DAY_LINES =
note - only support two or three letters
no Tues | Thur | Thurs | Sunday etc.
SportDb::Parser.parse_names( <<TXT ) Mon Mo Tue Tu Wed We Thu Th Fri Fr Sat Sa Sun Su TXT
- DAY_NAMES =
SportDb::Parser.build_names( DAY_LINES )
- DAY_MAP =
pp DAY_NAMES
SportDb::Parser.build_map( DAY_LINES, downcase: true )
- DATE_RE =
e.g. Fri Aug 9
%r{ ## note - do not include [] in capture for now - why? why not ## eat-up/consume optional [] - part i (?: \[ | \b ) (?<date> (?: ###### ## variant I/1/one ### Fri June 24 ## optional day name ((?<day_name>#{DAY_NAMES}) [ ] )? ## allow 1 or 2 spaces e.g. Jul 2 / Jun 27 to pretty print (?<month_name>#{MONTH_NAMES}) [ ]{1,2} (?<day>\d{1,2}) ## optional year ( [ ] (?<year>\d{4}) )? ) | (?: #### ## variant II/2/two ## 17- 3-22 - allow space befor mont ## 17-3-22 \d{1,2} - [ ]*\d{1,2} - (?: \d{4} | ## 2024 \d{2} ## or 24 only ) ) ) ## end date capture ## eat-up/consume optional [] - part ii (?: \] | \b ) }ix
- NOTE_BASICS_RE =
move to token-note(s) file !!!!
%r{ (?<note_open> \[ ) (?<note> (?: ## starting with ___ PLUS requiring more text (?: nb: ## e.g. [NB: between top-8 of regular season] # [NB: América, Morelia and Tigres qualified on better record regular season] # [NB: Celaya qualified on away goals] # [NB: Alebrijes qualified on away goal] # [NB: Leones Negros qualified on away goals] # # todo/fix: # add "top-level" NB: version ## with full (end-of) line note - why? why not? | (?: originally[ ])? scheduled ## e.g. [originally scheduled to play in Mexico City] | rescheduled ## e.g. [Rescheduled due to earthquake occurred in Mexico on September 19] | remaining ## e.g. [remaining 79'] ## [remaining 84'] ## [remaining 59'] ## [remaining 5'] | played ## e.g. [played in Macaé-RJ] ## [played in Caxias do Sul-RS] ## [played in Sete Lagoas-MG] ## [played in Uberlândia-MG] ## [played in Brasília-DF] ## [played in Vöcklabruck] ## [played in Pasching] | declared ## e.g. [declared void] | inter-group ## e.g. [inter-group A-B] ## [inter-group C-D] ) [ ] [^\]]+? ## slurp all to next ] - (use non-greedy) ) | (?: ## starting with in - do NOT allow digits ## name starting with in possible - why? why not? in[ ] [^0-9\]]+? ## e.g. [In Estadio La Corregidora] ## [in Unidad Deportiva Centenario] ## [in Estadio Olímpico Universitario] ## [in Estadio Victoria] ## [in UD José Brindis] ## [in Colomos Alfredo "Pistache" Torres stadium] ) | (?: (?: postponed ## e.g. [postponed due to problems with the screen of the stadium] ## [postponed by storm] ## [postponed due to tropical storm "Hanna"] ## [postponed from Sep 10-12 due to death Queen Elizabeth II] ## [postponed] -- include why? why not? | awarded ## e.g. [awarded match to Leones Negros by undue alignment; original result 1-2] ## [awarded 3-0 to Cafetaleros by undue alignment; originally ended 2-0] ## [awarded 3-0; originally 0-2, América used ineligible player (Federico Viñas)] | abandoned ## e.g. 
[abandoned at 1-1 in 65' due to cardiac arrest Luton player Tom Lockyer] ## [abandoned at 0-0 in 6' due to waterlogged pitch] ## [abandoned at 5-0 in 80' due to attack on assistant referee by Cerro; result stood] ## [abandoned at 1-0 in 31'] ## [abandoned at 0-1' in 85 due to crowd trouble] | suspended ## e.g. [suspended at 0-0 in 12' due to storm] ## [suspended at 84' by storm; result stood] | annulled ## e.g. [annulled] | replay ## e.g. [replay] | verified ## e.g. [verified 2:0 wo.] ) ([ ] ## note - optional text [^\]]+? )? ## slurp all to next ] - (use non-greedy) ) ) # note capture (?: (?<note_close> \] ) | $ ## note - allow open notes (that continue on next line) ) }ix
- NOTE_MORE_RE =
%r{ (?<=[ ]) ## one (leading) space min. required (?<note_cont> [⮑…] | \.{2,3} ### .. or ... ) [ ]* (?<note> [^\]]+? ## non-greeedy ) (?: (?<note_close> \] ) | $ ## note - allow open notes (that continue on next line) ) }ix
- NOTE_RE =
Regexp.union( NOTE_BASICS_RE, NOTE_MORE_RE, )
- TEXT_QUOTED =
simple (double) quoted text
only supports a-z (unicode) PLUS (single) inline space add more chars - why? why not?
'(?: " ' + ' \p{L}+ ' + ' (?: [ ] ' + ' \p{L}+ )* ' + ' " ) '
- TEXT_STRICT_RE =
%r{ (?<text> (?: \b | #{TEXT_QUOTED} [ ] ## note - leading quoted text must be followed by space!! ) \p{L}+ ## all unicode letters (e.g. [a-z]) (?: (?:[ ] | # only single spaces allowed inline!!! [-] )? (?: \p{L}+ | ['.] | (?: (?<= [ ]) #{TEXT_QUOTED} (?= [ ]|$) ### must be followed by space ## todo/fix - add all end of text lookaheads to (see below) ) ) )* ## must NOT end with space or dash(-) ## todo/fix - possible in regex here ## only end in alphanum a-z0-9 (not dot or & ???) ## positive lookahead ## cannot use \b if text ends in dot (.) or other non-alphnum ## than \b will not work ## not - add () too for now - why? why not? (?=[ ,;@|\[\]\(\)] |$ ) ) }ix
- TEXT_RE =
%r{ ## must start with alpha (allow unicode letters!!) (?<text> \b ## use/require word boundary (?: # opt 1 - start with alpha \p{L}+ ## all unicode letters (e.g. [a-z]) | # opt 2 - start with num!! - allow special case (e.g. 1. FC) \d+ # check for num lookahead (MUST be space or dot) ## MUST be followed by (optional dot) and ## required space !!! ## MUST be follow by a to z!!!! \.? ## optional dot [ ]? ## make space optional too - why? why not? ## yes - eg. 1st, 2nd, 5th etc. \p{L}+ ) (?:(?: (?:[ ] (?! (awd|abd|ppd|n/p|w/o)[ ]) ## note - exclude (awd[ ]/abd[ ]/n/p[ ]) ) | # only single spaces allowed inline!!! [-] )? (?: \p{L}+ | [&/'.] | (?: \d+ (?![0-9.:'/+-]) ## negative lookahead for numbers ## note - include digits itself!!! ) ) )* ## must NOT end with space or dash(-) ## todo/fix - possible in regex here ## only end in alphanum a-z0-9 (not dot or & ???) ## support (Hamburg) or such at the end (ony) ## note - no numbers allowed inside () for now!! (?: [ ]\(\p{L}+ (?: (?: [ ] | [-] )? \p{L}+ | [&/'.] )* \) )? ## add lookahead/lookbehind ## must be space!!! ## (or comma or start/end of string) ## kind of \b !!! ## positive lookahead ## note - added : too - why? why not? (?=[ ,;@|:\[\]] |$ ) ) }ix
- MINUTE_RE =
%r{ (?<minute> \b \d{1,3} '? ## optional minute quote (') (?: # optional offset/extra e.g. 45+ / 90+ or 45+10 / 90+5 (?: \+ (?: (?! [0-9]) ## negative look ahead (not a number) required | (?: \d{1,3} '? ## optional minute quote (') (?= (og|o|pen|p)? ([ ;,\]\)]|$)) ) ) ) | (?= (og|o|pen|p)? ([ ;,\]\)]|$)) # note - break can be og|pen|p too ) )}ix
- GOAL_PEN_RE =
goal types
%r{ (?<pen> (?<=\d|\+|[ ]|') ## must follow a number or plus (e.g. 45p / 45+p / 45 p / 45'p) or space (?: pen|p ) \b ) }ix
- GOAL_OG_RE =
%r{ (?<og> (?<=\d|\+|[ ]|') ## must follow a number or plus (e.g. 45og / 45+og / 45 og) or space (?: og|o ) \b ) }ix
- GROUP_RE =
Group A-Z Group 1-99 Group HEX # used in concaf world cup quali Group 1A or A1, B1 - used anywhere
use "key" of group - why? why not?
%r{(?<group> \b Group [ ] [a-z0-9]+ \b)}ix
- ROUND_RE =
%r{(?<round> \b (?: # round - note - requiers number e.g. round 1,2, etc. (?: (?: Round | Matchday | Week ) [ ] [0-9]+ ) | # more (kockout) rounds # playoffs - playoff, play-off, play-offs (?: Play-?offs? (?: [ ]for[ ]quarter-?finals )? ) | # round32 (?: Round[ ]of[ ]32 | Last[ ]32 | 16th[ ]finals | 1/16[ ]finals ) | # round16 (?: Round[ ]of[ ]16 | Last[ ]16 | 8th[ ]finals | 1/8[ ]finals ) | # fifthplace (?: (?: (Fifth|5th)[ -]place (?: [ ] (?: match|play-?off|final ))? ) | (?: Match[ ]for[ ](?: fifth|5th )[ -]place ) ) | # thirdplace (?: (?: (Third|3rd)[ -]place (?: [ ] (?: match|play-?off|final ))? ) | (?: Match[ ]for[ ](?: third|3rd )[ -]place ) ) | # quarterfinals (?: Quarter-?finals? | Quarters | Last[ ]8 ) | # semifinals (?: Semi-?finals? | Semis | Last[ ]4 ) | # final Finals? ) \b)}ix
- LEG_RE =
keep leg separate (from round) - why? why not?
%r{ (?<leg> \b (?: # leg1 (?: 1st|First)[ ]legs? | # leg2 (?: 2nd|Second)[ ]legs? ) \b)}ix
- SCORE_RE =
e.g. 2-1
%r{ (?<score> (?<=[ ]) # Positive lookbehind for space (?<score1>\d{1,2}) - (?<score2>\d{1,2}) (?=[ ]) # positive lookahead for space ) }ix
- SCORE_EXT_RE =
%r{ \[ (?<score_ext> (?: ## aet only e.g. aet aet (?: ## optional pen [,;][ ]* \d{1,2}-\d{1,2} [ ]? pen\.? )? ) | (?: ## penalty only e.g. 3-2 pen \d{1,2}-\d{1,2} [ ]? pen\.? ) ) \] }ix
- SCORE_AWD_RE =
awd - awarded
%r{ ## must be space before and after!!! (?<score_awd> (?<=[ ]) # Positive lookbehind for space awd (?=[ ]) # positive lookahead for space ) }ix
- SCORE_ABD_RE =
abd - abandoned
%r{ ## must be space before and after!!! (?<score_abd> (?<=[ ]) # Positive lookbehind for space abd (?=[ ]) # positive lookahead for space ) }ix
- SCORE_PPD_RE =
ppd - postponed
%r{ ## must be space before and after!!! (?<score_ppd> (?<=[ ]) # Positive lookbehind for space ppd (?=[ ]) # positive lookahead for space ) }ix
- SCORE_NP_RE =
n/p - not played
%r{ ## must be space before and after!!! (?<score_np> (?<=[ ]) # Positive lookbehind for space n/p (?=[ ]) # positive lookahead for space ) }ix
- SCORE_WO_RE =
A walkover, also W.O. or w/o (originally two words: “walk over”),
is awarded to the opposing team/player etc,
if there are no other players available, or they have been disqualified, because the other contestants have forfeited or the other contestants have withdrawn from the contest.
w/o - walk over
%r{ ## must be space before and after!!! (?<score_wo> (?<=[ ]) # Positive lookbehind for space w/o (?=[ ]) # positive lookahead for space ) }ix
Instance Method Summary collapse
- #log(msg) ⇒ Object
-
#parse(line, debug: false) ⇒ Object
convenience helper - ignore errors by default.
- #parse_with_errors(line, debug: false) ⇒ Object
-
#tokenize(line, debug: false) ⇒ Object
convenience helper - ignore errors by default.
- #tokenize_with_errors(line, debug: false) ⇒ Object
Instance Method Details
#log(msg) ⇒ Object
76 77 78 79 80 81 82 83 |
# File 'lib/rsssf/parser/token.rb', line 76

##
# Append +msg+ plus a trailing newline to ./logs.txt
# (path is relative to the current working directory).
#
# The file is opened in append mode with utf-8 encoding for every call
# and closed again when the block ends.
#
# @param msg [String] the message to record
# @return [Integer] whatever the final write call returns
def log( msg )
  log_path = './logs.txt'    ## use ./errors.txt - why? why not?

  File.open( log_path, 'a:utf-8' ) do |out|
    out.write( msg )
    out.write( "\n" )
  end
end
#parse(line, debug: false) ⇒ Object
convenience helper - ignore errors by default
92 93 94 95 |
# File 'lib/rsssf/parser/parser.rb', line 92

##
# Convenience wrapper around #parse_with_errors that drops the errors.
#
# @param line  [String]  the line to parse
# @param debug [Boolean] passed through to #parse_with_errors
# @return [Array] the nodes (first element of the [nodes, errors] pair)
def parse( line, debug: false )
  parse_with_errors( line, debug: debug ).first
end
#parse_with_errors(line, debug: false) ⇒ Object
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# File 'lib/rsssf/parser/parser.rb', line 20

##
# Tokenize +line+ and turn the tokens into (parse tree/ast) nodes.
#
# Two token patterns get rewritten, everything else passes through as-is:
#   text <score|score_awd|score_abd|score_ppd|score_np|score_wo|vs> text
#        => [:team, text], <middle token>, [:team, text]
#   text minute
#        => [:player, text], <minute token>
#
# @param line  [String]  the line to parse
# @param debug [Boolean] forwarded to #tokenize_with_errors
# @return [Array(Array, Array)] the nodes and the errors reported
#   by the tokenizer
def parse_with_errors( line, debug: false )
  errors = []

  ## note - forward the debug flag (it used to get dropped here)
  tokens, token_errors = tokenize_with_errors( line, debug: debug )
  errors += token_errors

  ## (disabled for now)
  ## #############
  ## ## pass 1
  ## ##   replace all texts with keyword matches (e.g. group, round, leg, etc.)
  ## tokens = tokens.map do |t|
  ##   if t[0] == :text
  ##     text = t[1]
  ##     if is_group?( text )
  ##       ### expects to be followed by num (or text ABC??)
  ##       [:group, text]
  ##     elsif is_matchday?( text )
  ##       ### expects to be followed by num
  ##       ## use different name e.g. :fix_round or such?
  ##       [:matchday, text]
  ##     elsif is_leg?( text )
  ##       [:leg, text]
  ##     elsif is_round?( text )
  ##       [:round, text]
  ##     else
  ##       t  ## pass through as-is (1:1)
  ##     end
  ##   else
  ##     t
  ##   end
  ## end
  ## ## puts "tokens:"
  ## ## pp tokens

  ## transform tokens into (parse tree/ast) nodes
  nodes = []

  ## note - (re)use token buffer from "standard" parser here !!!!
  buf = SportDb::Parser::Tokens.new( tokens )
  ## pp buf

  ## note - check for end-of-stream BEFORE reading;
  ##   otherwise a line without any tokens still calls buf.next once
  until buf.eos?
    if buf.match?( :text, [:score,
                           :score_awd,
                           :score_abd,
                           :score_ppd,
                           :score_np,
                           :score_wo,
                           :vs], :text )
      nodes << [:team, buf.next[1]]
      nodes << buf.next
      nodes << [:team, buf.next[1]]
    elsif buf.match?( :text, :minute )
      ## assume player+minute
      nodes << [:player, buf.next[1]]
      nodes << buf.next
    else
      ## pass through
      nodes << buf.next
    end
  end

  [nodes, errors]
end
#tokenize(line, debug: false) ⇒ Object
convenience helper - ignore errors by default
281 282 283 284 |
# File 'lib/rsssf/parser/token.rb', line 281

##
# Convenience wrapper around #tokenize_with_errors that drops the errors.
#
# @param line  [String]  the line to tokenize
# @param debug [Boolean] passed through to #tokenize_with_errors
# @return [Array] the tokens (first element of the [tokens, errors] pair)
def tokenize( line, debug: false )
  tokenize_with_errors( line, debug: debug ).first
end
#tokenize_with_errors(line, debug: false) ⇒ Object
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 |
# File 'lib/rsssf/parser/token.rb', line 92

##
# Split +line+ into tokens and collect parse errors along the way.
#
# The active regex is kept in @re and survives between calls:
# an opening [ or ( in standard mode (RE) switches to INSIDE_RE and
# the matching closing symbol (looked up via SYM_CLOSE) switches back.
#
# Text runs that no pattern matches get skipped; every skipped run is
# printed, passed to #log and recorded in the returned errors.
#
# @param line  [String]  the line to tokenize
# @param debug [Boolean] print the line, matches, offsets and progress
# @return [Array(Array, Array<String>)] the tokens and the error messages
def tokenize_with_errors( line, debug: false )
  tokens = []
  errors = []    ## keep a list of errors - why? why not?

  puts ">#{line}<"  if debug

  ## captures that map 1:1 to a [name, value] token (checked in this order)
  inside_kinds   = %i[text minute score_at og pen]
  standard_kinds = %i[text note group round leg date vs
                      score score_awd score_abd score_ppd score_np score_wo
                      score_ext]

  ## end of the last match (0 if nothing matched yet) -
  ##   used to report skipped text on no match
  pos = 0

  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re ||= RE    ## note - switch between RE & INSIDE_RE

  while (m = @re.match( line, pos ))
    if debug
      pp m
      puts "pos: #{pos}"
    end

    from, upto = m.offset(0)

    unless from == pos
      ## match NOT starting at start/begin position!!!
      ##   report parse error!!!
      ctx     = @re == INSIDE_RE ? 'INSIDE_RE' : 'RE'   ## assume RE
      skipped = line[pos..(from-1)]
      ## fix/change - use str.inspect to show tabs (\t)
      ##   and possibly other special characters causing trouble
      msg = " !! WARN - parse error (#{ctx}) - skipping >#{skipped}< @#{from},#{upto} in line >#{line}<"
      puts msg

      errors << "parse error (#{ctx}) - skipping >#{skipped}< @#{from},#{upto}"
      log( msg )
    end

    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end off string!!!!
    ##   report skipped text run too!!!

    pos = upto

    pp( [from, upto] )  if debug

    t = if m[:space] || m[:spaces]
          nil    ## skip space(s) - in both modes
        elsif @re == INSIDE_RE
          kind = inside_kinds.find { |name| m[name] }
          if kind
            [kind, m[kind]]
          elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            case sym
            when ',', ';', '@', '|', '-'
              [sym.to_sym]
            when '[', '('
              ## report error - already in inside mode!!!
              ##   e.g. another [ in [] or ( in ()
              log( "warn - unexpected (opening) #{sym} in inside (goal) mode in line >#{line}<" )  if sym == @sym_open
              nil
            when ']', ')'    ## allow [] AND () for inside mode
              ## leave inside match mode (only on the recorded close symbol)
              if sym == @sym_close
                @re        = RE
                @sym_open  = nil    ## reset sym_open/close
                @sym_close = nil
              end
              nil
            end    ## note - all other symbols get ignored (nil)
          end      ## note - no capture at all yields nil too
        else   ## assume standard mode/ctx
          kind = standard_kinds.find { |name| m[name] }
          if kind
            [kind, m[kind]]
          elsif m[:sym]
            sym = m[:sym]
            ## return symbols "inline" as is - why? why not?
            case sym
            when ',', ';', '@', '|'
              [sym.to_sym]
            when '[', '('
              ## switch to inside mode!!!
              @re        = INSIDE_RE
              @sym_open  = sym    ## record open/close style - why? why not?
              @sym_close = SYM_CLOSE[sym]
              nil
            when ']', ')'
              ## already in standard mode/ctx
              log( "warn - unexpected (closing) #{sym} in standard mode in line >#{line}<" )
              nil
            end    ## note - all other symbols get ignored (nil)
          end      ## note - no capture at all yields nil too
        end

    tokens << t  if t

    if debug
      print ">"
      print "*" * pos
      puts "#{line[pos..-1]}<"
    end
  end

  ## check if no match in end of string
  unless pos == line.size
    ## note - report regex context
    ##   e.g. RE or INSIDE_RE to help debugging/troubleshooting format errors
    ctx  = @re == INSIDE_RE ? 'INSIDE_RE' : 'RE'   ## assume RE
    rest = line[pos..-1]
    ## fix/change - use str.inspect to show tabs (\t)
    ##   and possibly other special characters causing trouble
    msg = " !! WARN - parse error (#{ctx}) - skipping >#{rest}< @#{pos},#{line.size} in line >#{line}<"
    puts msg
    log( msg )

    errors << "parse error (#{ctx}) - skipping >#{rest}< @#{pos},#{line.size}"
  end

  [tokens, errors]
end