Class: Wikiscript::OutlineReader

Inherits:
Object
  • Object
show all
Defined in:
lib/wikiscript/outline_reader.rb

Constant Summary collapse

HEADING_RE =
%r{\A
(?<marker>={1,})       ## 1. leading ======
  [ ]*
(?<text>[^=]+?)        ## 2. text  (note: for now no "inline" = allowed)
  [ ]*
  =*                   ## 3. (optional) trailing ====
\z}x

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(txt) ⇒ OutlineReader

Returns a new instance of OutlineReader.



15
16
17
# File 'lib/wikiscript/outline_reader.rb', line 15

def initialize( txt )
  @txt = txt
end

Class Method Details

.parse(txt) ⇒ Object



11
12
13
# File 'lib/wikiscript/outline_reader.rb', line 11

def self.parse( txt )
  new( txt ).parse
end

.read(path) ⇒ Object



6
7
8
9
# File 'lib/wikiscript/outline_reader.rb', line 6

def self.read( path )
  txt = File.open( path, 'r:utf-8' ) { |f| f.read }
  parse( txt )
end

Instance Method Details

#parseObject



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/wikiscript/outline_reader.rb', line 28

def parse
  outline = []   ## outline / page structure

  start_para = true      ## start new para(graph) on new text line?

  @txt.each_line do |line|

##
##  (auto-)sanitize first
##    -  &nbsp;   =>  "vanilla" space
##    -  1–2      => 1-2  - "vanilla" dash
##    todo - move up into txt!!!
      line = line.gsub( '&nbsp;', ' ' )
      line = line.gsub( /[–]/, '-' )


      line = line.strip      ## todo/fix: keep leading and trailing spaces - why? why not?

      if line.empty?    ## todo/fix: keep blank line nodes?? and just remove comments and process headings?! - why? why not?
        start_para = true
        next
      end

      break if line == '__END__'

      next if line.start_with?( '#' )   ## skip comments too
      ## strip inline (until end-of-line) comments too
      ##  e.g Eupen | KAS Eupen ## [de]
      ##   => Eupen | KAS Eupen
      ##  e.g bq   Bonaire,  BOE        # CONCACAF
      ##   => bq   Bonaire,  BOE
      line = line.sub( /#.*/, '' ).strip

       ## note: like in wikimedia markup (and markdown) all optional trailing ==== too
      if m=HEADING_RE.match( line )
         start_para = true

         heading_marker = m[:marker]
         heading_level  = heading_marker.length   ## count number of = for heading level
         heading        = m[:text].strip

         outline << [:"h#{heading_level}", heading]
      elsif line == '----'   ## make more generic/flexible - why? why not?
          start_para = true
          outline << [:hr]
 ## The horizontal rule represents a paragraph-level thematic break. Do not use in article content,
 ##  as rules are used only after main sections, and this is automatic.
 ## HTML equivalent: <hr /> (which can be indented,
 ##  whereas ---- always starts at the left margin.)
      else    ## assume it's a (plain/regular) text line
         if start_para
           outline << [:p, [line]]
           start_para = false
         else
           node = outline[-1]    ## get last entry
           if node[0] == :p      ##  assert it's a p(aragraph) node!!!
              node[1] << line    ## add line to p(aragraph)
           else
             puts "!! ERROR - invalid outline state / format - expected p(aragraph) node; got:"
             pp node
             exit 1
           end
         end
      end
  end
  outline
end