Class: SportDb::Parser

Inherits:

Object

Object
SportDb::Parser

show all

Defined in: lib/sportdb/parser/lang.rb,
lib/sportdb/parser/opts.rb,
lib/sportdb/parser/token.rb,
lib/sportdb/parser/linter.rb,
lib/sportdb/parser/parser.rb,
lib/sportdb/parser/token-date.rb,
lib/sportdb/parser/token-text.rb,
lib/sportdb/parser/token-score.rb

Defined Under Namespace

Constant Summary collapse

GROUP_RE = Group A-Z Group 1-99 Group HEX # used in CONCACAF world cup quali Group 1A or A1, B1 - used anywhere use "key" of group - why? why not?

%r{^
  Group [ ]
     (?<key>[a-z0-9]+)
$}ix

ROUND_RE =

%r{^(
   ## add special case for group play-off rounds!
   ##  group 2 play-off   (e.g. worldcup 1954, 1958)
   ##
   ### note - allow Group ("stand-alone") as "generic" round for now
   ##      BUT do NOT allow Group 1, Group 2, Group A, Group B, etc.
     (?: Group [ ] [A-Z0-9]+ [ ] Play-?offs?  |
         Group (?: [ ] phase)?  |
         League (?: [ ] phase)?
     )
        |
   # round  - note - requires number e.g. round 1,2, etc.
   #   note - use 1-9 regex (cannot start with 0) - why? why not?
   #             make week 01 or round 01 or matchday 01 possible?
      (?: (?: Round |
              Matchday |
              Week
           )
           [ ] [1-9][0-9]*
      )
       |
   ##  starting with qual(ification)
   ##   Qual. Round 1 / Qual. Round 2 / Qual. Round 3
   ##  or
   ##  Playoff Round 1
   ##  Play-in Round 1
     (?:  (?: Qual \. |
              Play-?off |
              Play-?in
          )
           [ ] Round [ ] [1-9][0-9]* )
       |
   ## 1. Round / 2. Round / 3. Round / etc.
   ##  First Round
   ##  Play-off Round
   ##  Final Round   (e.g. Worldcup 1950)
      (?:
           (?: [1-9][0-9]* \.  |
                1st | First   |
                2nd | Second  |
                Play-?off   |
                Final
           )
             [ ] Round
       )
       |
  ## starting with preliminary
  #   e.g.  Preliminary round
     (?:  Preliminary  [ ]
           (?:  Round |
                Semi-?finals |
                Final
           )
     )
     |
   # more (knockout) rounds
   # playoffs  - playoff, play-off, play-offs
        (?: Play-?offs?
           (?: [ ]for[ ]quarter-?finals )?
        )
        |
   # round32
        (?: Round[ ]of[ ]32 |
            Last[ ]32 )
          |
   # round16
        (?: Round[ ]of[ ]16 |
            Last[ ]16 |
            8th[ ]finals )
           |
   # fifthplace
         (?:
             (?: (Fifth|5th)[ -]place
                  (?: [ ] (?: match|play-?off|final ))?
              ) |
             (?: Match[ ]for[ ](?: fifth|5th )[ -]place )
         )
          |
   # thirdplace
          (?:
              (?: (Third|3rd)[ -]place
                     (?: [ ] (?: match|play-?off|final ))?
               ) |
              (?: Match[ ]for[ ](?: third|3rd )[ -]place )
           )
           |
   # quarterfinals
         (?:
              Quarter-?finals? |
              Quarters |
              Last[ ]8
          )
          |
   # semifinals
        (?:
             Semi-?finals? |
             Semis |
             Last[ ]4
        )
        |
   # final
         Finals?
         |
   # decider e.g. Entscheidungsspiel
         Decider
         |
    ## add replays
    ##  e.g. Final Replay
    ##       Quarter-finals replays
    ##       First round replays
     (?:
        (?: First [ ] Round |
            Quarter-?finals? |
            Finals?
         )
        [ ] Replays?
      )
     |
  ## more
     (?:
          Reclassification
     )
)$}ix

## keep leg separate (from round) - why? why not?
##
##  matches a text that is (in full) a leg marker
##     e.g. 1st leg, First Leg, 2nd leg, Second Leg
##
##  note - the alternatives MUST be wrapped in a group;
##           otherwise ^ only anchors the first alternative
##           and $ only anchors the last alternative
##           (and "1st leg replay" or "Cup 2nd leg" would match too)
LEG_RE =

%r{^(?:
  # leg1
     (?: 1st|First)[ ]leg
     |
  # leg2
     (?: 2nd|Second)[ ]leg
)$}ix

TIME_RE = keep 18h30 - why? why not? add support for 6:30pm 8:20am etc. - why? why not?

%r{
    ## e.g. 18.30 (or 18:30 or 18h30)
    (?<time>  \b
              (?<hour>\d{1,2})
                 (?: :|\.|h )
              (?<minute>\d{2})
              \b
    )
}ix

TIMEZONE_RE = for timezone format use for now: (BRT/UTC-3) (e.g. brazil time) (CET/UTC+1) - central european time (CEST/UTC+2) - central european summer time - daylight saving time (DST). (EET/UTC+2) - eastern european time (EEST/UTC+3) - eastern european summer time - daylight saving time (DST). UTC+3 UTC+4 UTC+0 UTC+00 UTC+0000 - allow +01 or +0100 - why? why not - +0130 (01:30) see https://en.wikipedia.org/wiki/Time_zone https://en.wikipedia.org/wiki/List_of_UTC_offsets https://en.wikipedia.org/wiki/UTC−04:00 etc.

%r{
   ## e.g. (UTC-2) or (CEST/UTC-2) etc.
   (?<timezone>
      \(
           ## optional "local" timezone name eg. BRT or CEST etc.
           (?:  [a-z]+
                 /
           )?
            [a-z]+
            [+-]
            \d{1,4}   ## e.g. 0 or 00 or 0000
      \)
   )
}ix

BASICS_RE =

%r{
    ## e.g. (51) or (1) etc.  - limit digits of number???
    (?<num> \(  (?<value>\d+) \) )
       |
    (?<vs>
       (?<=[ ])	# Positive lookbehind for space
       (?:
          vs\.?|   ## allow optional dot (eg. vs. v.)
          v\.?|
          -
       )   # not bigger match first e.g. vs than v etc.
       (?=[ ])   # positive lookahead for space
    )
       |
    (?<none>
       (?<=[ \[]|^)	 # Positive lookbehind for space or [
           -
        (?=[ ]*;)   # positive lookahead for space
    )
       |
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ])
        |
    (?<sym>[;,@|\[\]])
}ix

MINUTE_RE =

%r{
     (?<minute>
       (?<=[ ])	 # Positive lookbehind for space required
           (?<value>\d{1,3})      ## constrain numbers to 0 to 999!!!
        (?: \+
            (?<value2>\d{1,3})
        )?
        '     ## must have minute marker!!!!
     )
}ix

GOAL_PEN_RE = goal types (pen.) or (pen) or (p.) or (p) (o.g.) or (og)

%r{
   (?<pen> \(
           (?:pen|p)\.?
           \)
    )
}ix

GOAL_OG_RE =

%r{
   (?<og> \(
          (?:og|o\.g\.)
          \)
   )
}ix

RE =

Regexp.union( STATUS_RE,
                    TIMEZONE_RE,
TIME_RE,
DURATION_RE,  # note - duration MUST match before date
                    DATE_RE,
                    SCORE_RE,
                    BASICS_RE, MINUTE_RE,
                    GOAL_OG_RE, GOAL_PEN_RE,
TEXT_RE )

MONTH_LINES =

parse_names( <<TXT )
January    Jan
February   Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
August     Aug
September  Sept  Sep
October    Oct
November   Nov
December   Dec
TXT

MONTH_NAMES =

build_names( MONTH_LINES )

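MONTH_MAP = maps every downcased month name (e.g. "january", "jan") to its month number, starting with 1 (leftover debug comment in source: pp MONTH_NAMES)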

build_map( MONTH_LINES, downcase: true )

DAY_LINES =

parse_names( <<TXT )
Monday                   Mon  Mo
Tuesday            Tues  Tue  Tu
Wednesday                Wed  We
Thursday    Thurs  Thur  Thu  Th
Friday                   Fri  Fr
Saturday                 Sat  Sa
Sunday                   Sun  Su
TXT

DAY_NAMES =

build_names( DAY_LINES )

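DAY_MAP = maps every downcased day name (e.g. "monday", "mon", "mo") to its line number in DAY_LINES, starting with 1 for Monday (leftover debug comment in source: pp DAY_NAMES)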

build_map( DAY_LINES, downcase: true )

DATE_I_RE = e.g. Fri Aug/9 or Fri Aug 9

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          [ ]
     )?
     (?<month_name>#{MONTH_NAMES})
         (?: \/|[ ] )
     (?<day>\d{1,2})
     ## optional year
     (  [ ]
        (?<year>\d{4})
     )?
  \b
)}ix

DATE_II_RE = e.g. 3 June or 10 June

%r{
(?<date>
  \b
     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          [ ]
     )?
     (?<day>\d{1,2})
         [ ]
     (?<month_name>#{MONTH_NAMES})
     ## optional year
     (  [ ]
        (?<year>\d{4})
     )?
  \b
)}ix

DATE_RE = map tables note: order matters; first come-first matched/served

Regexp.union(
   DATE_I_RE,
   DATE_II_RE
)

DURATION_I_RE = todo add plus later on - why? why not?

%r{
(?<duration>
    \b
   ## optional day name
   ((?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name1>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day1>\d{1,2})
   ## optional year
   ( [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]*[-][ ]*

   ## optional day name
   ((?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<month_name2>#{MONTH_NAMES})
      (?: \/|[ ] )
   (?<day2>\d{1,2})
   ## optional year
   ( [ ]
      (?<year2>\d{4})
   )?
   \b
)}ix

DURATION_II_RE = variant ii e.g. 26 July - 27 July

%r{
(?<duration>
    \b
   ## optional day name
   ((?<day_name1>#{DAY_NAMES})
      [ ]
   )?
   (?<day1>\d{1,2})
      [ ]
   (?<month_name1>#{MONTH_NAMES})
   ## optional year
   ( [ ]
      (?<year1>\d{4})
   )?

   ## support + and -  (add .. or such - why??)
   [ ]*[-][ ]*

   ## optional day name
   ((?<day_name2>#{DAY_NAMES})
      [ ]
   )?
   (?<day2>\d{1,2})
      [ ]
   (?<month_name2>#{MONTH_NAMES})
   ## optional year
   ( [ ]
      (?<year2>\d{4})
   )?
   \b
)}ix

DURATION_RE = map tables note: order matters; first come-first matched/served

Regexp.union(
   DURATION_I_RE,
   DURATION_II_RE
)

TEXT_RE =

%r{
    ## must start with alpha (allow unicode letters!!)
    (?<text>
           ## positive lookbehind
           ##  (MUST be fixed number of chars - no quantifier e.g. +? etc.)
            (?<=[ ,;@|\[\]]
                 |^
            )
            (?:
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |

                # opt 2 - start with num!! - allow special case (e.g. 1. FC)
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MUST be followed by (optional dot) and
                      ##                      required space !!!
                      ## MUST be follow by a to z!!!!
                      \.?     ## optional dot
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+
                  |
                ## opt 3 - add weirdo case
                ##   e.g. 5.-8. Platz Playoffs  - keep - why? why not?
                    \d+\.-\d+\.  [ ]? \p{L}+
               )

              (?:(?:  (?:[ ]
                     (?!vs?\.?[ ])    ## note - exclude (v[ ]/vs[ ]/v.[ ]/vs.[ ])
                       )
                      |     # only single spaces allowed inline!!!
                     [-]
                  )?
                (?:
                  \p{L} |
                  [&/'°]
                    |
                 (?:
                   \d+
                   (?!
                     [0-9h'+-] |    ## protected break on 12h / 12' / 1-1
                                    ##  check usage for 3+4 - possible? where ? why?
                     (?:[.:]\d)     ## protected/exclude/break on 12.03 / 12:03
                    )
                   ## negative lookahead for numbers
                   ##   note - include digits itself!!!
                   ##   note - remove / (slash) e.g. allows UDI'19/Beter Bed
                 )|
                 \.
               )
              )*  ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)


            ## allow optional at the end
            ##  tag or year
            ##   make it and in the future - why? why not?
            ##
            ## change - fix
            ##   do NOT use (A) for amateur
            ##   use A or A. with NO ()!!!
            ## (A) -    allow with predefined  alpha only for now
            ##          e.g. (A) - amateur a team or b?
            ###  same for U21 or U9 etc
            ##        use with NO ()!!! - why? why not?
            ##      or U21 U9 etc.   - why? why not?
            ##       or etc.
            ## (1879-1893) or allow years e.g. (1879-1893)
            ###
            ##    add allow country code three to five letters for now
            ##       change to generic 1 to 5 - why? why not?
            ##     e.g. (A), (I),
            ##          (AUT)
            ##          (TRNC)   five? for UEFA code for northern cyprus
            ##     change to 1 to 4 - why? why not?
            ##   check - fix possible for upper case only here
            ##                     inline for this group only?
            (?:
               [ ]
               \(
                  \d{4}-\d{4}
               \)
            )?
             (?:
               [ ]+   ## allow more than once space - why? why not?
                  \( (?:
                       [A-Z]{1,5}
                     )
                  \)
             )?
            ## add lookahead/lookbehind
           ##    must be space!!!
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## positive lookahead
            (?=[ ,;@|\[\]]
                 |$
            )
   )
}ix

P_EN = english helpers (penalty, extra time, …) note - p must go last (shortest match) pso = penalty shootout

'(?: pso | pen\.? | p\.? )'

ET_EN = extra time markers, e.g. aet, a.e.t, a.e.t. (note - the examples p., p, pen, pen., PSO, etc. belong to P_EN above)

'(?: aet | a\.e\.t\.? )'

SCORE__P_ET__RE = note: allow SPECIAL cases WITHOUT full time scores (just a.e.t or pen. + a.e.t.) 3-4 pen. 2-2 a.e.t. 3-4 pen. 2-2 a.e.t. 2-2 a.e.t.

%r{
(?<score>
   \b
    (?:
       (?<p1>\d{1,2}) - (?<p2>\d{1,2})
         [ ]* #{P_EN} [ ]+
     )?             # note: make penalty (P) score optional for now
    (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]* #{ET_EN}
       (?=[ \]]|$)
)}ix

SCORE__P__RE = note: allow SPECIAL with penalty only 3-4 pen.

%r{
        (?<score>
  \b
     (?<p1>\d{1,2}) - (?<p2>\d{1,2})
       [ ]* #{P_EN}
       (?=[ \]]|$)
)}ix

SCORE__P_ET_FT_HT__RE = e.g. 3-4 pen. 2-2 a.e.t. (1-1, 1-1) or 3-4p 2-2aet (1-1, ) or 3-4 pen. 2-2 a.e.t. (1-1) or 2-2 a.e.t. (1-1, 1-1) or 2-2 a.e.t. (1-1, ) or 2-2 a.e.t. (1-1)

%r{
          (?<score>
   \b
   (?:
    (?<p1>\d{1,2}) - (?<p2>\d{1,2})
       [ ]* #{P_EN} [ ]+
    )?            # note: make penalty (P) score optional for now
   (?<et1>\d{1,2}) - (?<et2>\d{1,2})
       [ ]* #{ET_EN} [ ]+
       \(
       [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
       [ ]*
    (?:
         , [ ]*
        (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
            [ ]*
        )?
    )?              # note: make half time (HT) score optional for now
  \)
 (?=[ \]]|$)
)}ix

SCORE__P_FT_HT__RE = special case for case WITHOUT extra time!! same as above (but WITHOUT extra time and pen required)

%r{
         (?<score>
            \b
 (?<p1>\d{1,2}) - (?<p2>\d{1,2})
    [ ]* #{P_EN} [ ]+
    \(
    [ ]*
  (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
    [ ]*
 (?:
      , [ ]*
     (?: (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
         [ ]*
     )?
 )?              # note: make half time (HT) score optional for now
   \)
  (?=[ \]]|$)
)}ix

SCORE__FT_HT__RE = e.g. 2-1 (1-1) or 2-1

%r{
            (?<score>
 \b
 (?<ft1>\d{1,2}) - (?<ft2>\d{1,2})
  (?:
      [ ]+ \( [ ]*
   (?<ht1>\d{1,2}) - (?<ht2>\d{1,2})
      [ ]* \)
  )?   # note: make half time (HT) score optional for now
(?=[ \]]|$)
)}ix

SCORE_RE = map tables note: order matters; first come-first matched/served

Regexp.union(
  SCORE__P_ET_FT_HT__RE,  # e.g. 5-1 pen. 2-2 a.e.t. (1-1, 1-0)
  SCORE__P_FT_HT__RE,     # e.g. 5-1 pen. (1-1)
  SCORE__P_ET__RE,        # e.g. 2-2 a.e.t.  or  5-1 pen. 2-2 a.e.t.
  SCORE__P__RE,           # e.g. 5-1 pen.
  SCORE__FT_HT__RE,        # e.g. 1-1 (1-0) or 1-1  -- note - must go last!!!
)

Class Method Summary collapse

.build_map(lines, downcase: false) ⇒ Object
.build_names(lines) ⇒ Object
.more_round_names ⇒ Object
.parse_date(str, start:) ⇒ Object

add a date parser helper.
.parse_names(txt) ⇒ Object
.read_names(path) ⇒ Object

add more round names in different languages via txt files.

Instance Method Summary collapse

#is_group?(text) ⇒ Boolean
#is_leg?(text) ⇒ Boolean

Pair matches/games if marked with leg1 n leg2.
#is_round?(text) ⇒ Boolean
#log(msg) ⇒ Object
#parse(line, debug: false) ⇒ Object

convenience helper - ignore errors by default.
#parse_with_errors(line, debug: false) ⇒ Object
#tokenize(line, typed: false, debug: false) ⇒ Object

convenience helper - ignore errors by default.
#tokenize_with_errors(line, typed: false, debug: false) ⇒ Object

Class Method Details

.build_map(lines, downcase: false) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 40

## Build a lookup table that maps every name to the number
##   of the line (group of names) it is listed in, e.g.
##     {"january" => 1,  "jan" => 1,
##      "february" => 2, "feb" => 2,
##      "march" => 3,    "mar" => 3, ...}
##
##  lines    - array of lines; every line is an array of names
##  downcase - if true, use the downcased name as key
##
##  note: numbering starts with 1 (and NOT with zero)
def self.build_map( lines, downcase: false )
  lookup = {}
  lines.each_with_index do |names, idx|
    num = idx + 1
    names.each do |name|
      key = downcase ? name.downcase : name
      lookup[ key ] = num
    end
  end
  lookup
end

.build_names(lines) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 33

## Join all names of all lines into a single string
##   separated by | (ready for use as regex alternatives), e.g.
##     January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|...
##
##  lines - array of lines; every line is an array of names
def self.build_names( lines )
  lines.each_with_object( [] ) { |words, alts| alts << words.join( '|' ) }
       .join( '|' )
end

.more_round_names ⇒ `Object`

# File 'lib/sportdb/parser/lang.rb', line 180

## Return the list of additional round names;
##   read in (on first call only) via read_names from
##     <root>/config/rounds_<lang>.txt
##   for the languages en, de, es, pt and misc (in that order);
##   the result gets cached for all following calls
def self.more_round_names
   return @more_round_name   if @more_round_name

   ## sort names by length??
   @more_round_name = ['en', 'de', 'es', 'pt', 'misc'].flat_map do |lang|
     read_names( "#{SportDb::Module::Parser.root}/config/rounds_#{lang}.txt" )
   end
end

.parse_date(str, start:) ⇒ `Object`

add a date parser helper

# File 'lib/sportdb/parser/token-date.rb', line 160

## Parse a date string (matching DATE_RE) into a Date
##   e.g. Fri Aug/9 or Fri Aug 9 or 3 June or 10 June 2024
##
##  str   - the date string to parse
##  start - start date (of the event/season);
##            used to calculate the year if the string has no year, that is,
##            on or after the start month/day => start.year
##            before the start month/day      => start.year+1
##
## Returns a Date.
##   note - if the string does NOT match, an error message gets printed
##            and the program exits (with exit 1)
def self.parse_date( str, start: )
    m = DATE_RE.match( str )
    unless m
      puts "!! ERROR - unexpected date format; cannot parse >#{str}<"
      exit 1
    end

    year    = m[:year].to_i(10)  if m[:year]
    ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
    month   = MONTH_MAP[ m[:month_name].downcase ]   if m[:month_name]
    day     = m[:day].to_i(10)   if m[:day]
    ## note - the (optional) day name is NOT required for building the date

    ## no year in string - try to calculate year
    year ||= if  month > start.month ||
                (month == start.month && day >= start.day)
               # assume same year as start_at event (e.g. 2013 for 2013/14 season)
               start.year
             else
               # assume year+1 as start_at event (e.g. 2014 for 2013/14 season)
               start.year+1
             end

    Date.new( year,month,day )
end

.parse_names(txt) ⇒ `Object`

# File 'lib/sportdb/parser/token-date.rb', line 6

## Parse a text block into an array of lines (with words);
##   every line is an array of the words
##   split by spaces or tabs.
##
##  - blank lines and comment lines (starting with #) get skipped
##  - inline (until end-of-line) comments get stripped too
##      e.g. Janvier  Janv  Jan  ## check janv in use??
##      =>   Janvier  Janv  Jan
##
##  todo/fix -- add check for duplicates
def self.parse_names( txt )
  txt.each_line
     .map    { |raw| raw.strip }
     .reject { |entry| entry.empty? || entry.start_with?( '#' ) }
     .map    { |entry| entry.sub( /#.*/, '' ).strip.split( /[ \t]+/ ) }
end

.read_names(path) ⇒ `Object`

add more round names in different languages

  via txt files

for now must match case - maybe make case-insensitive later - why? why not?

# File 'lib/sportdb/parser/lang.rb', line 158

## Read in a list of names (one name per line)
##   from the text file at path (via read_text).
##
##  - blank lines and comment lines (starting with #) get skipped
##  - inline (until end-of-line) comments get stripped too
##      e.g. Janvier  Janv  Jan  ## check janv in use??
##      =>   Janvier  Janv  Jan
##
## Returns an array of strings (one per line).
def self.read_names( path )
  read_text( path ).each_line
                   .map    { |raw| raw.strip }
                   .reject { |entry| entry.empty? || entry.start_with?( '#' ) }
                   .map    { |entry| entry.sub( /#.*/, '' ).strip }
end

Instance Method Details

#is_group?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser/lang.rb', line 20

## Check if the text is a group (e.g. Group A, Group 1, etc.);
##   returns true if the text matches GROUP_RE (otherwise false)
def is_group?( text )
   matched = GROUP_RE.match?( text )
   matched
end

#is_leg?(text) ⇒ `Boolean`

Pair matches/games if marked with leg1 n leg2

Returns:

(Boolean)



211
212
213

# File 'lib/sportdb/parser/lang.rb', line 211

## Check if the text is a leg (e.g. 1st leg, Second Leg, etc.);
##   returns true if the text matches LEG_RE (otherwise false)
def is_leg?( text )
   matched = LEG_RE.match?( text )
   matched
end

#is_round?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/sportdb/parser/lang.rb', line 194

## Check if the text is a round (e.g. Round 1, Quarter-finals, Final, etc.);
##   returns true if the text matches ROUND_RE or
##   if the text is listed (as-is) in the more round names
def is_round?( text )
    return true   if ROUND_RE.match?( text )

    self.class.more_round_names.include?( text )
end

#log(msg) ⇒ `Object`

# File 'lib/sportdb/parser/token.rb', line 133

## Append the msg (plus a newline) to ./logs.txt (in utf-8)
##     use ./errors.txt - why? why not?
def log( msg )
   out = File.open( './logs.txt', 'a:utf-8' )
   begin
     out.write( msg )
     out.write( "\n" )
   ensure
     out.close     ## always close the file (even on write errors)
   end
end

#parse(line, debug: false) ⇒ `Object`

convenience helper - ignore errors by default

# File 'lib/sportdb/parser/parser.rb', line 206

## Parse the line and return the nodes only;
##   the errors reported by parse_with_errors get dropped
def parse( line, debug: false )
  parse_with_errors( line, debug: debug ).first
end

#parse_with_errors(line, debug: false) ⇒ `Object`

# File 'lib/sportdb/parser/parser.rb', line 103

## Parse a single line into a flat list of (parse tree/ast) nodes.
##
##  line  - the text line to parse
##  debug - passed along to the tokenizer (tokenize_with_errors)
##
##  steps:
##   1) tokenize the line (typed)
##   2) replace all text tokens with keyword matches (group, leg, round)
##   3) transform the tokens into nodes
##        e.g. text => team / player / geo,
##             round (at line start) => round_def,
##             group (at line start) => group_def, etc.
##
## Returns a two-element array [nodes, errors];
##   errors holds the error messages reported by the tokenizer.
def parse_with_errors( line, debug: false )
    errors = []
    ## note - pass along the debug flag to the tokenizer too
    tokens, token_errors = tokenize_with_errors( line, typed: true,
                                                       debug: debug )
    errors += token_errors

#############
## pass 1
##   replace all texts with keyword matches (e.g. group, round, leg, etc.)
##     note - order matters - check for group first, then leg, then round
     tokens = tokens.map do |t|
                      if t[0] == :text
                          text = t[1]
                          if is_group?( text )
                             [:group, text]
                          elsif is_leg?( text )
                             [:leg, text]
                          elsif is_round?( text )
                             [:round, text]
                          else
                              t   ## pass through as-is (1:1)
                          end
                      else
                         t
                      end
                end


    ## puts "tokens:"
    ## pp tokens

## transform tokens into (parse tree/ast) nodes
    nodes = []

    buf = Tokens.new( tokens )
    ## pp buf


    loop do
          break if buf.eos?

          ## simplify - remove separator for round + leg pair
          ##     e.g.  Round of 16, 1st Leg
          ##     allow Round of 16 - 1st Leg  too - why? why not?
          if buf.match?( :round, [:',', :'|',
                                    :'-',
                                    :vs,   ### fix - change parser to issue :'-' only for (-) not :vs!!!
                                    ], :leg )
                    nodes << [:round, buf.next[1]]
                    buf.next  ## swallow separator
                    nodes << [:leg, buf.next[1]]
                    next
          end


          if buf.pos == 0   ## MUST start line
            ## check for
            ##    group def or round def
            if buf.match?( :round, :'|', [:date, :duration] )    ## assume round def (change round to round_def)
                      nodes << [:round_def, buf.next[1]]
                      buf.next ## swallow pipe
                      nodes += buf.collect
                      break
            end
            if buf.match?( :group, :'|', :text )    ## assume group def (change group to group_def)
                      nodes << [:group_def, buf.next[1]]
                      buf.next ## swallow pipe
                      ## change all text to team
                      nodes += buf.collect { |t|
                                t[0] == :text ? [:team, t[1]] : t
                               }
                      break
            end
          end


          if buf.match?( :text, :'-', :text )  ## hacky? convert "generic" :- to :vs
             nodes << [:team, buf.next[1]]     ##    keep this rule/option - why? why not?
             nodes << [:vs]
             nodes << [:team, buf.next[1]]
          elsif buf.match?( :text, [:score, :vs], :text )
             nodes << [:team, buf.next[1]]
             nodes << buf.next
             nodes << [:team, buf.next[1]]
          elsif buf.match?( :text, :minute )
             nodes << [:player, buf.next[1]]
             nodes << buf.next
          elsif buf.cur == :'@'
               ## add all to the end as is
               ##   only change text to geo
              nodes += buf.collect  { |t|
                           t[0] == :text ? [:geo, t[1]] : t
                            }
              break
          else
             ## pass through
             nodes << buf.next
          end
    end

    [nodes,errors]
end

#tokenize(line, typed: false, debug: false) ⇒ `Object`

convenience helper - ignore errors by default

# File 'lib/sportdb/parser/token.rb', line 345

## Tokenize the line and return the tokens only;
##   the errors reported by tokenize_with_errors get dropped
def tokenize(  line, typed: false,
                     debug: false )
   tokenize_with_errors( line, typed: typed, debug: debug ).first
end

#tokenize_with_errors(line, typed: false, debug: false) ⇒ `Object`

# File 'lib/sportdb/parser/token.rb', line 144

## Split a line into tokens by matching RE again and again
##   (always starting at the end position of the last match).
##
##  line  - the text line to tokenize
##  typed - if true, add the parsed/converted values to the tokens
##            e.g. [:time, "18h30", {h:18,m:30}] instead of [:time, "18h30"]
##                 [:num, 51]                    instead of [:num, "(51)"]
##                 [:vs]                         instead of [:vs, "vs"]
##  debug - if true, print the matches and positions (for debugging)
##
##  note - skipped text (that is, text between two matches or
##           text after the last match) gets reported as parse error, that is,
##           printed, appended to the log (via log) and added to the errors
##
##  note - spaces and brackets ([]) do NOT result in tokens
##
## Returns a two-element array [tokens, errors].
## Raises an ArgumentError if typed and the time is out-of-range.
def tokenize_with_errors( line, typed: false,
                                debug: false )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?

  puts ">#{line}<"    if debug

  pos = 0
  ## track last offsets - to report error on no match
  ##   or no match in end of string
  offsets = [0,0]
  m = nil

  while m = RE.match( line, pos )
    if debug
      pp m
      puts "pos: #{pos}"
    end
    offsets = [m.begin(0), m.end(0)]

    if offsets[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      msg =  "!! WARN - parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      puts msg

      errors << "parse error - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
      log( msg )
    end

    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end off string!!!!
    ##    report skipped text run too!!!

    ## continue (with next match) at the end of this match
    pos = offsets[1]

    pp offsets   if debug

    ## note - order matters - the first named capture found wins
    t = if m[:space]
           ## skip space
           nil
        elsif m[:spaces]
           ## skip spaces
           nil
        elsif m[:text]
          [:text, m[:text]]   ## keep pos - why? why not?
        elsif m[:status]   ## (match) status e.g. cancelled, awarded, etc.
          if m[:status_note]   ## includes note? e.g.  awarded; originally 2-0
             [:status, m[:status], {note:m[:status_note]}]
          else
             [:status, m[:status]]
          end
        elsif m[:time]
          if typed
              ## unify to iso-format
              ###   12.40 => 12:40
              ##    12h40 => 12:40 etc.
              ##  keep string (no time-only type in ruby)
              hour =   m[:hour].to_i(10)  ## allow 08/07/etc.
              minute = m[:minute].to_i(10)
              ## check if valid -  0:00 - 24:00
              ##   check if 24:00 possible? or only 0:00 (23:59)
              if (hour >= 0 && hour <= 24) &&
                 (minute >=0 && minute <= 59)
               ## note - for debugging keep (pass along) "literal" time
               ##   might use/add support for am/pm later
               [:time, m[:time], {h:hour,m:minute}]
              else
                 raise ArgumentError, "parse error - time >#{m[:time]}< out-of-range"
              end
          else
            [:time, m[:time]]
          end
        elsif m[:date]
          if typed
            date = {}
=begin
            ((?<day_name>#{DAY_NAMES})
            [ ]
       )?
       (?<month_name>#{MONTH_NAMES})
           (?: \/|[ ] )
       (?<day>\d{1,2})
       ## optional year
       (  [ ]
          (?<year>\d{4})
       )?
=end
 ## map month names
 ## note - allow any/upcase JULY/JUL etc. thus ALWAYS downcase for lookup
            date[:y] = m[:year].to_i(10)  if m[:year]
            date[:m] = MONTH_MAP[ m[:month_name].downcase ]   if m[:month_name]
            date[:d]  = m[:day].to_i(10)   if m[:day]
            date[:wday] = DAY_MAP[ m[:day_name].downcase ]   if m[:day_name]
            ## note - for debugging keep (pass along) "literal" date
            [:date, m[:date], date]
          else
            [:date, m[:date]]
          end
        elsif m[:timezone]
          [:timezone, m[:timezone]]
        elsif m[:duration]
          if typed
            duration = { start: {}, end: {}}
            duration[:start][:y] = m[:year1].to_i(10)  if m[:year1]
            duration[:start][:m] = MONTH_MAP[ m[:month_name1].downcase ]   if m[:month_name1]
            duration[:start][:d]  = m[:day1].to_i(10)   if m[:day1]
            duration[:start][:wday] = DAY_MAP[ m[:day_name1].downcase ]   if m[:day_name1]
            duration[:end][:y] = m[:year2].to_i(10)  if m[:year2]
            duration[:end][:m] = MONTH_MAP[ m[:month_name2].downcase ]   if m[:month_name2]
            duration[:end][:d]  = m[:day2].to_i(10)   if m[:day2]
            duration[:end][:wday] = DAY_MAP[ m[:day_name2].downcase ]   if m[:day_name2]
            ## note - for debugging keep (pass along) "literal" duration
            [:duration, m[:duration], duration]
          else
            [:duration, m[:duration]]
          end
        elsif m[:num]
          if typed
              ## note -  strip enclosing () and convert to integer
             [:num, m[:value].to_i(10)]
          else
             [:num, m[:num]]
          end
        elsif m[:score]
          if typed
              score = {}
              ## check for pen
              score[:p] = [m[:p1].to_i(10),
                           m[:p2].to_i(10)]  if m[:p1] && m[:p2]
              score[:et] = [m[:et1].to_i(10),
                            m[:et2].to_i(10)]  if m[:et1] && m[:et2]
              score[:ft] = [m[:ft1].to_i(10),
                            m[:ft2].to_i(10)]  if m[:ft1] && m[:ft2]
              score[:ht] = [m[:ht1].to_i(10),
                            m[:ht2].to_i(10)]  if m[:ht1] && m[:ht2]

            ## note - for debugging keep (pass along) "literal" score
            [:score, m[:score], score]
          else
            [:score, m[:score]]
          end
        elsif m[:minute]
          if typed
              minute = {}
              minute[:m]      = m[:value].to_i(10)
              minute[:offset] = m[:value2].to_i(10)   if m[:value2]
             ## note - for debugging keep (pass along) "literal" minute
             [:minute, m[:minute], minute]
          else
             [:minute, m[:minute]]
          end
        elsif m[:og]
          typed  ?  [:og] : [:og, m[:og]]    ## for typed drop - string version/variants
        elsif m[:pen]
          typed  ?  [:pen] : [:pen, m[:pen]]
        elsif m[:vs]
          typed  ?  [:vs] : [:vs, m[:vs]]
        elsif m[:none]
          typed  ?  [:none] : [:none, m[:none]]
        elsif m[:sym]
          sym = m[:sym]
          ## return symbols "inline" as is - why? why not?
          case sym
          when ',' then [:',']
          when ';' then [:';']
          when '@' then [:'@']
          when '|' then [:'|']
          else
            nil  ## ignore others (e.g. brackets [])
          end
        else
          ## report error
          nil
        end

    ## note - skipped matches (e.g. spaces, brackets) are nil and NOT added
    tokens << t    if t

    if debug
      print ">"
      print "*" * pos
      puts "#{line[pos..-1]}<"
    end
  end

  ## check if no match in end of string
  if offsets[1] != line.size
    msg =  "!! WARN - parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
    puts msg
    log( msg )

    errors << "parse error - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
  end


  [tokens,errors]
end

Class: SportDb::Parser

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.build_map(lines, downcase: false) ⇒ Object

.build_names(lines) ⇒ Object

.more_round_names ⇒ Object

.parse_date(str, start:) ⇒ Object

.parse_names(txt) ⇒ Object

.read_names(path) ⇒ Object

Instance Method Details

#is_group?(text) ⇒ Boolean

#is_leg?(text) ⇒ Boolean

#is_round?(text) ⇒ Boolean

#log(msg) ⇒ Object

#parse(line, debug: false) ⇒ Object

#parse_with_errors(line, debug: false) ⇒ Object

#tokenize(line, typed: false, debug: false) ⇒ Object

#tokenize_with_errors(line, typed: false, debug: false) ⇒ Object

.build_map(lines, downcase: false) ⇒ `Object`

.build_names(lines) ⇒ `Object`

.more_round_names ⇒ `Object`

.parse_date(str, start:) ⇒ `Object`

.parse_names(txt) ⇒ `Object`

.read_names(path) ⇒ `Object`

#is_group?(text) ⇒ `Boolean`

#is_leg?(text) ⇒ `Boolean`

#is_round?(text) ⇒ `Boolean`

#log(msg) ⇒ `Object`

#parse(line, debug: false) ⇒ `Object`

#parse_with_errors(line, debug: false) ⇒ `Object`

#tokenize(line, typed: false, debug: false) ⇒ `Object`

#tokenize_with_errors(line, typed: false, debug: false) ⇒ `Object`