Class: Govspeak::StructuredHeaderExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/govspeak/structured_header_extractor.rb

Instance Method Summary collapse

Constructor Details

#initialize(document) ⇒ StructuredHeaderExtractor

Returns a new instance of StructuredHeaderExtractor.



19
20
21
22
23
# File 'lib/govspeak/structured_header_extractor.rb', line 19

# Builds an extractor for the given document.
#
# Starts with an empty result list and an empty working stack.
#
# @param document [Object] source document; stored in @doc
#   (presumably exposed through a `doc` reader whose `headers` are read
#   later by #headers_list - the reader is not visible here, TODO confirm)
def initialize(document)
  @structured_headers = []
  @doc = document
  reset_stack
end

Instance Method Details

#add_child(header) ⇒ Object



63
64
65
# File 'lib/govspeak/structured_header_extractor.rb', line 63

# Nests +header+ under the header currently on top of the stack.
#
# The stack itself is left untouched; only the top entry's `headers`
# collection grows.
#
# @param header [Object] header to attach as a child
# @return [Object] the parent's `headers` collection after the append
def add_child(header)
  parent = stack.last
  parent.headers.push(header)
end

#add_sibling(header) ⇒ Object



58
59
60
61
# File 'lib/govspeak/structured_header_extractor.rb', line 58

# Attaches +header+ next to the header currently on top of the stack.
#
# The top entry (the previous sibling) is popped first, so the entry
# beneath it - the shared parent - receives +header+ in its `headers`.
#
# @param header [Object] header to attach as a sibling of the previous one
# @return [Object] the parent's `headers` collection after the append
def add_sibling(header)
  stack.pop # drop the previous sibling to expose the shared parent
  shared_parent = stack.last
  shared_parent.headers.push(header)
end

#add_top_level(header) ⇒ Object



53
54
55
56
# File 'lib/govspeak/structured_header_extractor.rb', line 53

# Records +header+ as a new root entry of the result list.
#
# The working stack is reset afterwards, discarding whatever nesting
# context existed before this header.
#
# @param header [Object] header to append to `structured_headers`
# @return [Object] whatever #reset_stack returns
def add_top_level(header)
  structured_headers << header
  reset_stack
end

#add_uncle_or_aunt(header) ⇒ Object



67
68
69
70
# File 'lib/govspeak/structured_header_extractor.rb', line 67

# Attaches +header+ further up the current branch.
#
# The stack is first unwound via #pop_stack_to_level, and +header+ is then
# appended to the `headers` of whichever entry is left on top.
#
# @param header [Object] header whose level is higher (numerically lower)
#   than that of the header on top of the stack
# @return [Object] the new parent's `headers` collection after the append
def add_uncle_or_aunt(header)
  pop_stack_to_level(header)
  new_parent = stack.last
  new_parent.headers.push(header)
end

#call ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/govspeak/structured_header_extractor.rb', line 25

# Walks every entry of #headers_list in order and arranges them into a tree.
#
# For each header:
# * it is skipped if #header_higher_than_top_level? says so;
# * otherwise the first matching rule picks how it is attached
#   (top level, sibling, child, or uncle/aunt of the previous header);
# * a header matching none of the rules is ignored;
# * an attached header is pushed onto the stack, becoming the "previous"
#   header for the next iteration.
#
# @return [Object] `structured_headers`, the collection of top-level headers
def call
  headers_list.each do |heading|
    next if header_higher_than_top_level?(heading)

    # Pick the attachment strategy; rule order matters and mirrors the
    # precedence top-level > sibling > child > uncle/aunt.
    placement =
      if heading.top_level?
        :add_top_level
      elsif header_at_same_level_as_prev?(heading)
        :add_sibling
      elsif header_one_level_lower_than_prev?(heading)
        :add_child
      elsif header_at_higher_level_than_prev?(heading)
        :add_uncle_or_aunt
      end
    next unless placement # ignore semantically invalid headers

    send(placement, heading)
    stack.push(heading)
  end

  structured_headers
end

#header_at_higher_level_than_prev?(header) ⇒ Boolean

Returns:

  • (Boolean)


85
86
87
88
# File 'lib/govspeak/structured_header_extractor.rb', line 85

# Tells whether +header+ sits higher in the hierarchy than the header on
# top of the stack. "Higher" means a numerically smaller `level`.
#
# @param header [Object] header to compare against the previous one
# @return [Boolean, nil] true/false from the comparison, or nil when the
#   stack is empty
def header_at_higher_level_than_prev?(header)
  prev = stack.last
  prev && header.level < prev.level
end

#header_at_same_level_as_prev?(header) ⇒ Boolean

Returns:

  • (Boolean)


76
77
78
# File 'lib/govspeak/structured_header_extractor.rb', line 76

# Tells whether +header+ has the same `level` as the header on top of the
# stack.
#
# @param header [Object] header to compare against the previous one
# @return [Boolean, nil] true/false from the comparison, or nil when the
#   stack is empty
def header_at_same_level_as_prev?(header)
  prev = stack.last
  prev && prev.level == header.level
end

#header_higher_than_top_level?(header) ⇒ Boolean

Returns:

  • (Boolean)


72
73
74
# File 'lib/govspeak/structured_header_extractor.rb', line 72

# Tells whether +header+ ranks above what the header itself reports as the
# top level, i.e. its `level` is numerically smaller than its `top_level`.
#
# @param header [Object] header responding to `level` and `top_level`
# @return [Boolean]
def header_higher_than_top_level?(header)
  header.top_level > header.level
end

#header_one_level_lower_than_prev?(header) ⇒ Boolean

Returns:

  • (Boolean)


80
81
82
83
# File 'lib/govspeak/structured_header_extractor.rb', line 80

# Tells whether +header+ is exactly one step below the header on top of the
# stack. "Lower" means a numerically larger `level`, so this holds when
# +header+'s level is the previous level plus one.
#
# @param header [Object] header to compare against the previous one
# @return [Boolean, nil] true/false from the comparison, or nil when the
#   stack is empty
def header_one_level_lower_than_prev?(header)
  prev = stack.last
  prev && prev.level + 1 == header.level
end

#headers_list ⇒ Object



47
48
49
50
51
# File 'lib/govspeak/structured_header_extractor.rb', line 47

# Flat, memoised list of StructuredHeader objects, one per entry of
# `doc.headers`, in the same order.
#
# Each StructuredHeader receives the source header's text, level and id,
# plus its own fresh empty array as the fourth argument.
#
# @return [Array<StructuredHeader>] the same array on every call
def headers_list
  @headers_list ||= doc.headers.map { |raw| StructuredHeader.new(raw.text, raw.level, raw.id, []) }
end

#pop_stack_to_level(header) ⇒ Object



90
91
92
93
# File 'lib/govspeak/structured_header_extractor.rb', line 90

# Unwinds the stack so that +header+ can be attached above the current top.
#
# The number of pops is the level difference between the top of the stack
# and +header+, plus one. For example, with levels [2, 3, 4] on the stack
# and a level-3 header, two entries are popped and only the level-2 entry
# remains.
#
# @param header [Object] header whose `level` determines how far to unwind
# @return [Integer] the number of pops requested
def pop_stack_to_level(header)
  level_gap = stack.last.level - header.level
  # +1 also removes the entry at the header's own level
  (level_gap + 1).times { stack.pop }
end

#reset_stack ⇒ Object



95
96
97
# File 'lib/govspeak/structured_header_extractor.rb', line 95

# Replaces the working stack (@stack) with a new empty array, dropping any
# nesting context built up so far.
#
# @return [Array] the new empty stack
def reset_stack
  @stack = []
end