Class: Rsssf::Parser

Inherits:

Object

Object
Rsssf::Parser

show all

Defined in: lib/rsssf/parser/token.rb,
lib/rsssf/parser/linter.rb,
lib/rsssf/parser/parser.rb,
lib/rsssf/parser/token-date.rb,
lib/rsssf/parser/token-note.rb,
lib/rsssf/parser/token-text.rb,
lib/rsssf/parser/token-goals.rb,
lib/rsssf/parser/token-round.rb,
lib/rsssf/parser/token-score.rb

Defined Under Namespace

Classes: Linter

Constant Summary collapse

BASICS_RE =

%r{
    (?<spaces> [ ]{2,}) |
    (?<space>  [ ]) 
        |
    (?<sym>[;,@|\[\]\(\)])     ## note - add () too  - why? why not?
}ix

VS_RE =

%r{   ## must be space before and after!!!
    (?<vs>
      (?<=[ ])	# Positive lookbehind for space
         -
       (?=[ ])   # positive lookahead for space 
    )
}ix

RE =

Regexp.union(  GROUP_RE, ROUND_RE, LEG_RE,
                    DATE_RE,
                    VS_RE,
                    SCORE_RE,
                    SCORE_AWD_RE, SCORE_ABD_RE, SCORE_PPD_RE, SCORE_NP_RE,
  SCORE_WO_RE,
                    SCORE_EXT_RE,
                    NOTE_RE,
                    BASICS_RE,
TEXT_RE )

HYPHEN_RE = rename to dash or to ??? used to add/allow hyphen/dash (-) in INSIDE_RE

%r{   ## must be space before and after (or end of line)!!!
  ##  note - uses SYM capture 
    (?<sym>
      (?<=[ ])	# Positive lookbehind for space
         -
       (?=[ ]|$)   # positive lookahead for space 
    )
}ix

SCORE_AT_RE = rename to ?? use SCORE_AT for now - why? why not? add support for score at/score points/markers e.g. [1-0 Andrei 08, 1-1 Rydlewicz 24, 1-2 Prica 85, 2-2 Bella 88, 2-3 Arvidsson 102]

%r{ (?<score_at> 
      \b
      \d{1,2}-\d{1,2}
      \b
    )  
}ix

INSIDE_RE = “strict” text match mode inside brackets ]

Regexp.union(  SCORE_AT_RE,
   GOAL_OG_RE, GOAL_PEN_RE,
   BASICS_RE, HYPHEN_RE,
   TEXT_STRICT_RE,
   MINUTE_RE, 
)

SYM_CLOSE = open/close pairs - lookup close (by open char)

{
  '(' => ')',
  '[' => ']',
}

MONTH_LINES = note - support only 5 letters max for now; no January|February|August etc.

SportDb::Parser.parse_names( <<TXT )
Jan
Feb
March      Mar
April      Apr
May
June       Jun
July       Jul
Aug
Sept       Sep
Oct
Nov
Dec
TXT

MONTH_NAMES =

SportDb::Parser.build_names( MONTH_LINES )

MONTH_MAP = pp MONTH_NAMES

SportDb::Parser.build_map( MONTH_LINES, downcase: true )

DAY_LINES = note - only support two or three letters; no Tues | Thur | Thurs | Sunday etc.

SportDb::Parser.parse_names( <<TXT )
Mon  Mo
Tue  Tu
Wed  We
Thu  Th
Fri  Fr
Sat  Sa
Sun  Su
TXT

DAY_NAMES =

SportDb::Parser.build_names( DAY_LINES )

DAY_MAP = pp DAY_NAMES

SportDb::Parser.build_map( DAY_LINES, downcase: true )

DATE_RE = e.g. Fri Aug 9

%r{
 ## note - do not include [] in capture for now - why? why not
    ## eat-up/consume optional [] - part i
    (?: \[ | \b
     )
(?<date>

     (?:  ######  
          ## variant I/1/one
          ###   Fri June 24 

     ## optional day name
     ((?<day_name>#{DAY_NAMES})
          [ ]
     )?    
     ##  allow 1 or 2 spaces e.g. Jul  2 / Jun 27 to pretty print
     (?<month_name>#{MONTH_NAMES})
         [ ]{1,2}
     (?<day>\d{1,2})
     ## optional year
     (  [ ]
        (?<year>\d{4})
     )?   
     )
    |
     (?: #### 
         ## variant II/2/two
         ##   17- 3-22   - allow space befor mont
         ##   17-3-22
            \d{1,2}
             -
            [ ]*\d{1,2} 
             -
             (?:
                \d{4} |   ## 2024
                \d{2}     ## or 24 only
             )
     )
     )  ## end date capture
  ## eat-up/consume optional [] - part ii
  (?: \] | \b
  )        
}ix

NOTE_BASICS_RE = move to token-note(s) file !!!!

%r{
    (?<note_open> \[ )
   (?<note>
     (?:  ##  starting with ___   PLUS requiring more text
       (?:
          nb:
          ##  e.g. [NB: between top-8 of regular season]
          #        [NB: América, Morelia and Tigres qualified on better record regular season]
          #        [NB: Celaya qualified on away goals]
          #        [NB: Alebrijes qualified on away goal]
          #        [NB: Leones Negros qualified on away goals]
          #
          # todo/fix:
          # add "top-level" NB: version
          ##   with full (end-of) line note - why? why not?
          |
          (?: originally[ ])? scheduled
          ## e.g. [originally scheduled to play in Mexico City] 
          |
          rescheduled
          ## e.g.  [Rescheduled due to earthquake occurred in Mexico on September 19]
          |
          remaining
          ## e.g. [remaining 79']   
          ##      [remaining 84'] 
          ##      [remaining 59']   
          ##      [remaining 5']
          |
          played  
          ## e.g. [played in Macaé-RJ]
          ##      [played in Caxias do Sul-RS]
          ##      [played in Sete Lagoas-MG]
          ##      [played in Uberlândia-MG]
          ##      [played in Brasília-DF]
          ##      [played in Vöcklabruck]
          ##      [played in Pasching]
          |
          declared
          ## e.g.  [declared void]
          |
          inter-group
          ## e.g. [inter-group A-B]
          ##      [inter-group C-D]
       )
      [ ]
      [^\]]+?    ## slurp all to next ] - (use non-greedy) 
     )
      |
     (?:
       ## starting with in  - do NOT allow digits
       ##   name starting with in possible - why? why not?
           in[ ]
            [^0-9\]]+?
       ## e.g. [In Estadio La Corregidora] 
       ##      [in Unidad Deportiva Centenario]
       ##      [in Estadio Olímpico Universitario]
       ##      [in Estadio Victoria]
       ##      [in UD José Brindis]
       ##      [in Colomos Alfredo "Pistache" Torres stadium]
     )
      |
      (?:
          (?:
             postponed    
             ## e.g. [postponed due to problems with the screen of the stadium]
             ##      [postponed by storm]
             ##      [postponed due to tropical storm "Hanna"]
             ##      [postponed from Sep 10-12 due to death Queen Elizabeth II]
             ##     [postponed]  -- include why? why not?
             |
             awarded
             ## e.g. [awarded match to Leones Negros by undue alignment; original result 1-2]
             ##     [awarded 3-0 to Cafetaleros by undue alignment; originally ended 2-0]
             ##     [awarded 3-0; originally 0-2, América used ineligible player (Federico Viñas)]
             |
             abandoned
             ## e.g. [abandoned at 1-1 in 65' due to cardiac arrest Luton player Tom Lockyer]
             ##      [abandoned at 0-0 in 6' due to waterlogged pitch]
             ##     [abandoned at 5-0 in 80' due to attack on assistant referee by Cerro; result stood]
             ##    [abandoned at 1-0 in 31']
             ##    [abandoned at 0-1' in 85 due to crowd trouble]
             |
              suspended
              ## e.g. [suspended at 0-0 in 12' due to storm]  
              ##      [suspended at 84' by storm; result stood]
              |
              annulled
              ## e.g.  [annulled]
              |
              replay
              ## e.g.  [replay]
              |
              verified
              ## e.g.  [verified 2:0 wo.]
          )
        ([ ]    ## note - optional text
          [^\]]+?
         )?         ## slurp all to next ] - (use non-greedy) 
      )
    )    # note capture  
        
     (?: 
         (?<note_close> \] )
         | $ ## note - allow open notes (that continue on next line) 
      )  
}ix

NOTE_MORE_RE =

%r{
      (?<=[ ])  ## one (leading) space min. required
       (?<note_cont>
             [⮑…] |
             \.{2,3}   ### .. or ...
       )
        [ ]*
       (?<note>
            [^\]]+?   ## non-greeedy
          )  
       (?: 
         (?<note_close> \] )
         | $ ## note - allow open notes (that continue on next line) 
       )  
}ix

NOTE_RE =

Regexp.union(  NOTE_BASICS_RE, 
 NOTE_MORE_RE, 
)

TEXT_QUOTED = simple (double) quoted text only supports a-z (unicode) PLUS (single) inline space add more chars - why? why not?

'(?:  "    ' +
'  \p{L}+  ' + 
'     (?: [ ]  ' +
'        \p{L}+ )*   '  + 
'    "  )  '

TEXT_STRICT_RE =

%r{
   (?<text>
         (?: \b |  #{TEXT_QUOTED} [ ]   ## note - leading quoted text must be followed by space!!
          )
          \p{L}+    ## all unicode letters (e.g. [a-z])
           
             (?:
               (?:[ ]
                    |     # only single spaces allowed inline!!!
                   [-]                                              
               )?
               (?:
                  \p{L}+ |
                   ['.] |
                   (?:
                      (?<= [ ])
                      #{TEXT_QUOTED}
                      (?= [ ]|$)   ### must be followed by space
                                  ##  todo/fix - add all end of text lookaheads to (see below)
                   )
               )  
              )*  
               ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)
   
        ## positive lookahead
        ##   cannot use \b  if text ends in dot (.) or other non-alphnum 
        ##        than \b will not work
        ##   not    - add () too for now - why? why not? 
            (?=[ ,;@|\[\]\(\)]  
                 |$
            )  
    )
}ix

TEXT_RE =

%r{
    ## must start with alpha (allow unicode letters!!)      
    (?<text>    
             \b   ## use/require word boundary
            (?:  
                # opt 1 - start with alpha
                 \p{L}+    ## all unicode letters (e.g. [a-z])
                   |

                # opt 2 - start with num!! - allow special case (e.g. 1. FC) 
                     \d+  # check for num lookahead (MUST be space or dot)
                      ## MUST be followed by (optional dot) and
                      ##                      required space !!!
                      ## MUST be follow by a to z!!!!
                      \.?     ## optional dot
                      [ ]?   ## make space optional too  - why? why not?
                             ##  yes - eg. 1st, 2nd, 5th etc.
                       \p{L}+               
               )
        
              (?:(?:  (?:[ ]
                     (?! (awd|abd|ppd|n/p|w/o)[ ])    ## note - exclude (awd[ ]/abd[ ]/n/p[ ])
                       )  
                      |     # only single spaces allowed inline!!!
                     [-]                                              
                  )?
                (?:
                  \p{L}+ | [&/'.] 
                    |
                 (?:
                   \d+ 
                   (?![0-9.:'/+-])   
                   ## negative lookahead for numbers
                   ##   note - include digits itself!!!
                 )  
               )  
              )*  ## must NOT end with space or dash(-)
              ##  todo/fix - possible in regex here
              ##     only end in alphanum a-z0-9 (not dot or & ???)

              ## support (Hamburg) or such at the end (ony)
              ##   note - no numbers allowed inside () for now!!
             (?:
                  [ ]\(\p{L}+
                      (?:
                         (?: [ ] |
                             [-]
                          )? 
                          \p{L}+ | [&/'.]
                        )*
                      \)
             )?


            ## add lookahead/lookbehind
           ##    must be space!!! 
           ##   (or comma or  start/end of string)
           ##   kind of \b !!!
            ## positive lookahead
            ##  note - added : too - why? why not?
            (?=[ ,;@|:\[\]]
                 |$
            )
   )   
}ix

MINUTE_RE =

%r{
     (?<minute>
   \b
      \d{1,3}   
      '?   ## optional minute quote (')
      (?:       
         # optional offset/extra e.g. 45+ / 90+ or 45+10 / 90+5
          (?: \+
            (?: 
               (?! [0-9])   ## negative look ahead (not a number) required
               |    
              (?:
                \d{1,3} 
                '?   ## optional minute quote (')
                (?= (og|o|pen|p)? ([ ;,\]\)]|$))
              )
            )
          )
          |
          (?= (og|o|pen|p)? ([ ;,\]\)]|$))  # note - break can be og|pen|p too
   )
)}ix

GOAL_PEN_RE = goal types

%r{
   (?<pen> 
        (?<=\d|\+|[ ]|')	## must follow a number or plus (e.g. 45p / 45+p / 45 p / 45'p) or space
            (?: pen|p )
            \b 
    )
}ix

GOAL_OG_RE =

%r{
   (?<og> 
        (?<=\d|\+|[ ]|')	## must follow a number or plus (e.g. 45og / 45+og / 45 og) or space
          (?: og|o )
          \b
   )
}ix

GROUP_RE = Group A-Z; Group 1-99; Group HEX (used in CONCACAF world cup quali); Group 1A or A1, B1 - used anywhere; use "key" of group - why? why not?

%r{(?<group>
     \b
    Group [ ]
       [a-z0-9]+
\b)}ix

ROUND_RE =

%r{(?<round>
            \b
   (?:
   # round  - note - requiers number e.g. round 1,2, etc.
(?:  (?: Round |
        Matchday |
        Week
     )
     [ ] [0-9]+
)
|
   # more (kockout) rounds
   # playoffs  - playoff, play-off, play-offs
  (?: Play-?offs? 
     (?: [ ]for[ ]quarter-?finals )?
  )
  |    
   # round32
  (?: Round[ ]of[ ]32 | 
      Last[ ]32 |
      16th[ ]finals |  
      1/16[ ]finals
      )
    |
   # round16   
  (?: Round[ ]of[ ]16 |
      Last[ ]16 | 
      8th[ ]finals |
      1/8[ ]finals 
      )
     |
   # fifthplace
   (?:
       (?: (Fifth|5th)[ -]place 
            (?: [ ] (?: match|play-?off|final ))?
        ) |
       (?: Match[ ]for[ ](?: fifth|5th )[ -]place )
   )
    |
   # thirdplace
    (?: 
        (?: (Third|3rd)[ -]place 
               (?: [ ] (?: match|play-?off|final ))? 
         ) |
        (?: Match[ ]for[ ](?: third|3rd )[ -]place ) 
     )
     |
   # quarterfinals
   (?:
        Quarter-?finals? |
        Quarters |
        Last[ ]8
    )
    |     
   # semifinals
  (?:   
       Semi-?finals? |
       Semis |
       Last[ ]4
  )
  |
   # final
   Finals? 
 )
\b)}ix

LEG_RE = keep leg separate (from round) - why? why not?

%r{ (?<leg>
              \b
  (?:
   # leg1
 (?: 1st|First)[ ]legs? 
 |
  # leg2 
 (?: 2nd|Second)[ ]legs?
  )
\b)}ix

SCORE_RE = e.g. 2-1

%r{      
  (?<score>
      (?<=[ ])	# Positive lookbehind for space
         (?<score1>\d{1,2}) - (?<score2>\d{1,2})
      (?=[ ])   # positive lookahead for space 
  )
}ix

SCORE_EXT_RE =

%r{ \[
    (?<score_ext>
        (?:       ## aet only e.g.  aet
           aet
           (?:   ##  optional pen
             [,;][ ]*
             \d{1,2}-\d{1,2} [ ]? pen\.? 
           )?
        )
        |
        (?:   ##  penalty only e.g. 3-2 pen
          \d{1,2}-\d{1,2} [ ]? pen\.?
        )
    )
  \]
}ix

SCORE_AWD_RE = awd - awarded

%r{  ## must be space before and after!!!
    (?<score_awd>
      (?<=[ ])	# Positive lookbehind for space
        awd
       (?=[ ])   # positive lookahead for space 
    )
}ix

SCORE_ABD_RE = abd - abandoned

%r{  ## must be space before and after!!!
    (?<score_abd>
      (?<=[ ])	# Positive lookbehind for space
        abd
       (?=[ ])   # positive lookahead for space 
    )
}ix

SCORE_PPD_RE = ppd - postponed

%r{  ## must be space before and after!!!
    (?<score_ppd>
      (?<=[ ])	# Positive lookbehind for space
        ppd
       (?=[ ])   # positive lookahead for space 
    )
}ix

SCORE_NP_RE = n/p - not played

%r{  ## must be space before and after!!!
    (?<score_np>
      (?<=[ ])	# Positive lookbehind for space
         n/p
       (?=[ ])   # positive lookahead for space 
    )
}ix

SCORE_WO_RE = A walkover, also W.O. or w/o (originally two words: “walk over”), is awarded to the opposing team/player etc, if there are no other players available, or they have been disqualified, because the other contestants have forfeited or the other contestants have withdrawn from the contest. w/o - walk over

%r{  ## must be space before and after!!!
    (?<score_wo>
      (?<=[ ])	# Positive lookbehind for space
         w/o
       (?=[ ])   # positive lookahead for space 
    )
}ix

Instance Method Summary collapse

#log(msg) ⇒ Object
#parse(line, debug: false) ⇒ Object

convenience helper - ignore errors by default.
#parse_with_errors(line, debug: false) ⇒ Object
#tokenize(line, debug: false) ⇒ Object

convenience helper - ignore errors by default.
#tokenize_with_errors(line, debug: false) ⇒ Object

Instance Method Details

#log(msg) ⇒ `Object`

# File 'lib/rsssf/parser/token.rb', line 76

## Appends +msg+ plus a trailing newline to ./logs.txt
##   (file is opened in append mode with utf-8 encoding
##    and closed again when the block returns).
##
##   open question kept from the original - use ./errors.txt - why? why not?
def log( msg )
   File.open( './logs.txt', 'a:utf-8' ) { |out| out.write( msg ); out.write( "\n" ) }
end

#parse(line, debug: false) ⇒ `Object`

convenience helper - ignore errors by default

# File 'lib/rsssf/parser/parser.rb', line 92

## Convenience helper - returns the nodes only;
##   the errors collected by parse_with_errors are dropped.
def parse( line, debug: false )
  parse_with_errors( line, debug: debug ).first
end

#parse_with_errors(line, debug: false) ⇒ `Object`

# File 'lib/rsssf/parser/parser.rb', line 20

## Tokenizes +line+ and turns the tokens into a flat list of
##  (parse tree/ast) nodes.
##
##  - text, score-like token (score, score_awd, score_abd, score_ppd,
##      score_np, score_wo or vs), text
##       => [:team, text], the score-like token as-is, [:team, text]
##  - text, minute
##       => [:player, text], the minute token as-is
##  - any other token is passed through as-is (1:1)
##
##  line  - the text line to parse
##  debug - forwarded to tokenize_with_errors
##
##  returns [nodes, errors] - errors are the tokenizer errors
def parse_with_errors( line, debug: false )
    errors = []
    ## fix - forward the debug flag (it was accepted but never passed on)
    tokens, token_errors = tokenize_with_errors( line, debug: debug )
    errors += token_errors


=begin
#############
## pass 1 
##   replace all texts with keyword matches (e.g. group, round, leg, etc.)
     tokens = tokens.map do |t|
                      if t[0] == :text
                          text = t[1]
                          if is_group?( text )
                             ### expects to be followed by num (or text ABC??)
                             [:group, text]   
                          elsif is_matchday?( text )
                             ### expects to be followed by num
                             ##  use different name e.g. :fix_round or such?
                             [:matchday, text]   
                          elsif is_leg?( text )
                             [:leg, text]
                          elsif is_round?( text )
                             [:round, text]
                          else
                              t   ## pass through as-is (1:1)
                          end
                      else
                         t
                      end
                end


    ## puts "tokens:"
    ## pp tokens
=end

## transform tokens into (parse tree/ast) nodes    
    nodes = []
    
    ## note - (re)use token buffer from "standard" parser here !!!!
    buf = SportDb::Parser::Tokens.new( tokens )
    ## pp buf


    ## fix - check for end-of-stream BEFORE reading;
    ##   the former loop/break-at-end shape called buf.next once
    ##   even if there were no tokens at all (e.g. blank line)
    ##   NOTE(review): presumably buf.next returns nil at end-of-stream,
    ##      which added a stray nil node - confirm against Tokens
    until buf.eos?
          if buf.match?( :text, [:score, 
                                 :score_awd,
                                 :score_abd,
                                 :score_ppd,
                                 :score_np,
                                 :score_wo,
                                 :vs], :text )
             nodes << [:team, buf.next[1]]
             nodes << buf.next
             nodes << [:team, buf.next[1]]
          elsif buf.match?( :text, :minute )    ## assume player+minute
             nodes << [:player, buf.next[1]]
             nodes << buf.next
          else
             ## pass through
             nodes << buf.next
          end
    end

    [nodes,errors]
end

#tokenize(line, debug: false) ⇒ `Object`

convenience helper - ignore errors by default

# File 'lib/rsssf/parser/token.rb', line 281

## Convenience helper - returns the tokens only;
##   the errors collected by tokenize_with_errors are dropped.
def tokenize(  line, debug: false )
   tokenize_with_errors( line, debug: debug ).first
end

#tokenize_with_errors(line, debug: false) ⇒ `Object`

# File 'lib/rsssf/parser/token.rb', line 92

## Splits +line+ into tokens by matching the current regex (@re)
##  again and again, each time starting at the end of the last match.
##
##  Two modes/contexts:
##   - standard - uses RE (the default on first use)
##   - inside   - uses INSIDE_RE; entered on an opening [ or ( in standard mode
##                 and left again on the matching closing ] or )
##
##  note - the mode (@re) and the recorded bracket style
##    (@sym_open/@sym_close) are instance state and are NOT reset here;
##    a bracket left open on one line keeps inside mode
##    for the next tokenize call.
##
##  Text that gets skipped (a match not starting at the current position
##   or an unmatched rest at the end of line) is reported three ways -
##   printed via puts, appended via log, and added to errors.
##
##  line  - the text line to tokenize
##  debug - if true, prints the line, every match, the offsets
##            and the remaining text
##
##  returns [tokens, errors]
##    - tokens are [type, text] pairs e.g. [:text, "..."]
##        or single-element arrays for symbols e.g. [:','];
##        space(s) and brackets produce no token
##    - errors are message strings
def tokenize_with_errors( line, debug: false )
  tokens = []
  errors = []   ## keep a list of errors - why? why not?

  puts ">#{line}<"    if debug

  pos = 0
  ## track last offsets - to report error on no match 
  ##   or no match in end of string
  offsets = [0,0]
  m = nil

  ####
  ## quick hack - keep re state/mode between tokenize calls!!!
  @re  ||= RE     ## note - switch between RE & INSIDE_RE
  


  ## note - @re may change inside the loop body (mode switch);
  ##   the next iteration matches with the new regex
  while m = @re.match( line, pos )
    if debug
      pp m
      puts "pos: #{pos}"  
    end
    offsets = [m.begin(0), m.end(0)]

    if offsets[0] != pos
      ## match NOT starting at start/begin position!!!
      ##  report parse error!!!
      ##   (the text between pos and the match start gets skipped)

      ctx = @re == INSIDE_RE ? 'INSIDE_RE' : 'RE'  ## assume RE
      ## fix/change - use str.inspect to show tabs (\t)
      ##          and possibly other special characters causing trouble     
      msg =  "  !! WARN - parse error (#{ctx}) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]} in line >#{line}<"
      puts msg

      errors << "parse error (#{ctx}) - skipping >#{line[pos..(offsets[0]-1)]}< @#{offsets[0]},#{offsets[1]}"
      log( msg )
    end

    ##
    ## todo/fix - also check if possible
    ##   if no match but not yet end of string!!!!
    ##    report skipped text run too!!!

    ## continue after the end of this match
    pos = offsets[1]

    pp offsets   if debug

    ## map the match to a token (or nil to drop it);
    ##   note - only named groups of the regex in use get checked per mode
    t =  if @re == INSIDE_RE
           if m[:space]
             nil   ## skip space
           elsif m[:spaces]
             nil  ## skip spaces
           elsif m[:text] 
             [:text, m[:text]]   ## keep pos - why? why not?
           elsif m[:minute]
             [:minute, m[:minute]]
           elsif m[:score_at]
             [:score_at, m[:score_at]]
          elsif m[:og]
             [:og, m[:og]]    ## for typed drop - string version/variants
           elsif m[:pen]
             [:pen, m[:pen]]
           elsif m[:sym]
             sym = m[:sym]
             ## return symbols "inline" as is - why? why not?
             case sym
             when ',' then [:',']
             when ';' then [:';']
             when '@' then [:'@']
             when '|' then [:'|']   
             when '-' then [:'-']                
             when '[', '('
               ## note - only logs if same as the recorded opening symbol;
               ##    no token either way
               if sym == @sym_open   
                 ## report error - already in inside mode!!!
                 ##  e.g. another [ in [] or ( in ()
                 log( "warn - unexpected (opening) #{sym} in inside (goal) mode in line >#{line}<" )
               end
               nil
             when ']', ')'   ## allow [] AND () for inside mode
               ## puts "  leave inside match mode"
               ## note - only the closing symbol matching the recorded
               ##   opening style switches back to standard mode
               if sym == @sym_close 
                   @re = RE
                   @sym_open  = nil  ## reset sym_open/close
                   @sym_close = nil
               end
               nil
             else
              nil  ## ignore others (e.g. brackets [])
             end
           else
             ## report error  - why? why not?
             nil
           end    
         else  ## assume standard mode/ctx
           if m[:space]
             nil   ## skip space
           elsif m[:spaces]
             nil  ## skip spaces
           elsif m[:text] 
             [:text, m[:text]]   ## keep pos - why? why not?
           elsif m[:note]
             [:note, m[:note]]
           elsif m[:group]
             [:group, m[:group]]
           elsif m[:round]
             [:round, m[:round]]
           elsif m[:leg]
             [:leg, m[:leg]]
           elsif m[:date]
             [:date, m[:date]]
           elsif m[:vs]
             [:vs, m[:vs]]
           elsif m[:score]
             [:score, m[:score]]
           elsif m[:score_awd]   # awarded (awd)
             [:score_awd, m[:score_awd]]
           elsif m[:score_abd]   # abandoned (abd)
             [:score_abd, m[:score_abd]]
           elsif m[:score_ppd]   # postponed (ppd)
             [:score_ppd, m[:score_ppd]]
           elsif m[:score_np]    # not played (n/p)
             [:score_np, m[:score_np]]
           elsif m[:score_wo]    # walk over (w/o)
             [:score_wo, m[:score_wo]]
           elsif m[:score_ext]
             [:score_ext, m[:score_ext]]
           elsif m[:sym]
             sym = m[:sym]
             ## return symbols "inline" as is - why? why not?
             case sym
             when ',' then [:',']
             when ';' then [:';']
             when '@' then [:'@']
             when '|' then [:'|']   
             when '[', '('
               ##  switch to inside mode!!!
               ## puts "  enter inside match mode"
               @re = INSIDE_RE
               @sym_open  =  sym      ## record open/close style - why? why not?
               @sym_close =  SYM_CLOSE[sym]
               nil
             when ']', ')'
               log( "warn - unexpected (closing) #{sym} in standard mode in line >#{line}<" )   
               ## already in standard mode/ctx
               ##  report warn/error - why? why not?
               nil
             else
               nil  ## ignore others (e.g. brackets [])
             end
           else
             ## report error  - why? why not?
             nil
           end
         end


    tokens << t    if t    

    if debug
      ## print the line with the consumed part masked by stars (*)
      print ">"
      print "*" * pos
      puts "#{line[pos..-1]}<"
    end
  end


  ## check if no match in end of string
  ##   (offsets[1] is the end of the last match or 0 if nothing matched)
  if offsets[1] != line.size

    ## note - report regex context
    ##  e.g.  RE or INSIDE_RE  to help debugging/troubleshooting format errors
    ctx = @re == INSIDE_RE ? 'INSIDE_RE' : 'RE'  ## assume RE
    ## fix/change - use str.inspect to show tabs (\t)
    ##          and possibly other special characters causing trouble     

    msg =  "  !! WARN - parse error (#{ctx}) - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size} in line >#{line}<"
    puts msg
    log( msg )

    errors << "parse error (#{ctx}) - skipping >#{line[offsets[1]..-1]}< @#{offsets[1]},#{line.size}"
  end


  [tokens,errors] 
end

Class: Rsssf::Parser

Defined Under Namespace

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#log(msg) ⇒ Object

#parse(line, debug: false) ⇒ Object

#parse_with_errors(line, debug: false) ⇒ Object

#tokenize(line, debug: false) ⇒ Object

#tokenize_with_errors(line, debug: false) ⇒ Object

#log(msg) ⇒ `Object`

#parse(line, debug: false) ⇒ `Object`

#parse_with_errors(line, debug: false) ⇒ `Object`

#tokenize(line, debug: false) ⇒ `Object`

#tokenize_with_errors(line, debug: false) ⇒ `Object`