Class: TableExtractor

Inherits:
Object show all
Defined in:
lib/table_extractor.rb

Class Method Summary collapse

Class Method Details

.extract_tables(lines, regexp:) ⇒ Array<Hash>

Extract tables from an array of text lines formatted in Markdown style

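Parameters:

  • lines (Array<String>)

    The text lines to scan for tables

  • regexp (Regexp)

    Pattern tested against each stripped line to recognise a table separator row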

Returns:

  • (Array<Hash>)

    An array of tables with row count, column count, and start index



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/table_extractor.rb', line 8

# Extract tables from an array of text lines formatted in Markdown style.
#
# A table is recognised by its separator line (a stripped line matching
# +regexp+). The line immediately before the separator is taken as the
# header, and every following line containing a '|' is counted as a row
# until a line without one (or another separator) is reached.
#
# @param lines [Array<String>] the text lines to scan
# @param regexp [Regexp] pattern tested against each stripped line to
#   recognise a separator row
# @return [Array<Hash>] one hash per table with +:rows+ (line count including
#   header and separator), +:columns+ and +:start_index+ (index of the first
#   line of the table within +lines+)
def self.extract_tables(lines, regexp:)
  tables = []
  inside_table = false
  previous_was_data_row = false
  table_start = nil
  row_count = 0
  column_count = 0

  # Appends the table currently being tracked to the result list.
  record_table = lambda do
    tables << { rows: row_count, columns: column_count, start_index: table_start }
  end

  lines.each_with_index do |line, index|
    stripped = line.strip

    if stripped.match?(regexp)
      if inside_table
        # The line just before this separator is the header of the new table;
        # if it was counted as a data row of the previous table, give it back.
        row_count -= 1 if previous_was_data_row
        record_table.call
      end

      # A separator on the very first line has no header above it, so the
      # table starts at the separator itself and has one row fewer.
      has_header = index.positive?
      table_start = has_header ? index - 1 : index
      row_count = has_header ? 2 : 1

      # Count cells on the stripped line with the optional outer pipes
      # removed, so surrounding whitespace or a missing outer pipe does not
      # skew the result.
      cells = stripped.sub(/\A\|/, '').sub(/\|\z/, '')
      column_count = cells.split('|', -1).count

      inside_table = true
      previous_was_data_row = false
    elsif inside_table && line.include?('|')
      row_count += 1
      previous_was_data_row = true
    elsif inside_table
      # A line without a pipe ends the table.
      record_table.call
      inside_table = false
      previous_was_data_row = false
      table_start = nil
      row_count = 0
      column_count = 0
    end
  end

  # Handle the case where a table runs to the last line.
  record_table.call if inside_table

  tables
end