Module: PdfExtract::Sections

Defined in:
lib/analysis/sections.rb

Class Method Summary collapse

Class Method Details

.add_content_stats(sections, page_count) ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/analysis/sections.rb', line 72

# Replaces each section with a hash of textual statistics.
#
# For every section the result of Spatial.drop_spatial(section) is
# merged with :letter_ratio, :year_ratio, :cap_ratio, :name_ratio and
# :word_count (each computed by Language from the section's text
# content) plus :lateness, which is the highest page number among the
# section's components divided by page_count.
#
# @param sections [Array<Hash>] sections, each carrying a non-empty
#   :components list whose entries have a :page
# @param page_count [Numeric] page total used to normalise :lateness
# @return [Array<Hash>] one augmented hash per section, in input order
def self.add_content_stats(sections, page_count)
  sections.map do |section|
    # max_by rather than max: a one-argument block given to max is used
    # as a comparator, so the page number was being read as a comparison
    # result and the highest page was not reliably selected.
    last_page = section[:components].max_by { |c| c[:page] }[:page]
    content = Spatial.get_text_content(section)

    Spatial.drop_spatial(section).merge({
      :letter_ratio => Language.letter_ratio(content),
      :year_ratio => Language.year_ratio(content),
      :cap_ratio => Language.cap_ratio(content),
      :name_ratio => Language.name_ratio(content),
      :word_count => Language.word_count(content),
      # Float division so the ratio is not truncated to an integer.
      :lateness => (last_page / page_count.to_f)
    })
  end
end

.candidate?(pdf, region, column) ⇒ Boolean

Returns:

  • (Boolean)


24
25
26
27
28
29
30
31
32
# File 'lib/analysis/sections.rb', line 24

# Decides whether a region is wide enough to be section body text.
#
# A region qualifies when it is no wider than its column and its width
# is at least pdf.settings[:width_ratio] of the column width.
# NOTE(review): earlier commentary mentioned an exemption for
# single-line regions; no such exemption is implemented here.
#
# @param pdf [#settings] object whose settings hash holds :width_ratio
# @param region [Hash] region with a :width
# @param column [Hash] containing column with a :width
# @return [Boolean]
def self.candidate?(pdf, region, column)
  minimum_ratio = pdf.settings[:width_ratio]
  region_width = region[:width]
  column_width = column[:width]

  # A region spilling outside its column can never be a candidate.
  return false unless region_width <= column_width

  region_width.to_f / column_width >= minimum_ratio
end

.clusters_to_spatials(clusters) ⇒ Object



62
63
64
65
66
67
68
69
70
# File 'lib/analysis/sections.rb', line 62

# Flattens clusters into a single list of their items, tagging every
# item (in place) with a :centre string describing its cluster.
#
# The :centre string is the cluster's centre values, each rounded to
# three decimal places and joined with ", ".
#
# @param clusters [Array<Hash>] clusters with a :centre hash and :items
# @return [Array] all items from all clusters, in cluster order
def self.clusters_to_spatials(clusters)
  collected = []

  clusters.each do |cluster|
    members = cluster[:items]

    members.each do |member|
      # Built per member so every item owns its own label string.
      rounded = cluster[:centre].values.map { |value| value.round(3) }
      member[:centre] = rounded.join(", ")
    end

    collected << members
  end

  collected.flatten
end

.include_in(pdf) ⇒ Object



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# File 'lib/analysis/sections.rb', line 87

# Registers the :sections spatial on the given pdf object.
#
# The spatial depends on :regions and :columns. Columns are collected
# first; each region is then attached to the first collected column on
# the same page for which Spatial.contains?(column, region, 1) holds.
# Once all objects have been seen, regions are walked page by page and
# column by column, merging consecutive matching candidate regions into
# sections, which are finally given content stats and a reference score.
#
# @param pdf [Object] object exposing `spatials` (and `settings`, read
#   by candidate?)
# @return [Object] whatever pdf.spatials returns
def self.include_in(pdf)
  pdf.spatials(:sections, :depends_on => [:regions, :columns]) do |parser|

    entries = []

    parser.objects(:columns) do |column|
      entries << { :column => column, :regions => [] }
    end

    parser.objects(:regions) do |region|
      # Keep only the columns on the region's page that contain it.
      enclosing = entries.select do |entry|
        col = entry[:column]
        col[:page] == region[:page] && Spatial.contains?(col, region, 1)
      end

      enclosing.first[:regions] << region unless enclosing.empty?
    end

    parser.after do
      # Order the regions of every column from highest to lowest
      # (descending :y).
      entries.each do |entry|
        entry[:regions].sort_by! { |region| -region[:y] }
      end

      # Bucket the columns by page, then order each page's columns
      # from left to right (ascending :x).
      pages = entries.group_by { |entry| entry[:column][:page] }
      pages.each_value do |page_entries|
        page_entries.sort_by! { |entry| entry[:column][:x] }
      end

      sections = []
      # The section currently being accumulated; nil when none is open.
      open_section = nil

      pages.each_value do |page_entries|
        page_entries.each do |entry|
          column = entry[:column]

          entry[:regions].each do |region|
            if candidate?(pdf, region, column)
              if open_section && match?(open_section, region)
                # Same shape as the open section: fold the region in.
                open_section.merge!(Spatial.merge_lines(open_section, region, {}))
                open_section[:components] << Spatial.get_dimensions(region)
              else
                # Different shape (or nothing open): close what we have
                # and start a new section from this region.
                sections << open_section if open_section
                open_section = region.merge(
                  :components => [Spatial.get_dimensions(region)]
                )
              end
            elsif possible_header?(pdf, region, column)
              # A header splits sections; the header itself is dropped.
              sections << open_section if open_section
              open_section = nil
            end
          end
        end
      end

      # Flush the section still open after the last region.
      sections << open_section if open_section

      # Sections are complete; attach textual statistics. The page
      # total is the number of distinct pages that have columns.
      sections = add_content_stats(sections, pages.size)

      # Score sections by how closely their statistics resemble those
      # of a reference section. Each entry is a pair of numbers —
      # presumably [ideal value, weight]; confirm against Spatial.score.
      reference_ideals = {
        :name_ratio => [0.14, 1],
        :letter_ratio => [0.23, 6],
        :year_ratio => [0.05, 10],
        :cap_ratio => [0.49, 10],
        :lateness => [0.96, 6]
      }

      Spatial.score(sections, reference_ideals, :reference_score)

      sections
    end

  end
end

.match?(a, b) ⇒ Boolean

Returns:

  • (Boolean)


15
16
17
18
19
20
21
22
# File 'lib/analysis/sections.rb', line 15

# Tells whether two regions look alike enough to be merged: their
# widths differ by at most 10% of their average width, and their line
# heights are equal once rounded to two decimal places.
#
# @param a [Hash] region with :width and :line_height
# @param b [Hash] region with :width and :line_height
# @return [Boolean]
def self.match?(a, b)
  # Allowed width difference: a tenth of the mean of the two widths.
  tolerance = ((a[:width] + b[:width]) / 2.0) * 0.1
  similar_width = (a[:width] - b[:width]).abs <= tolerance

  heights = [a, b].map { |region| region[:line_height].round(2) }
  same_font_size = heights.first == heights.last

  similar_width && same_font_size
end

.possible_header?(pdf, region, column) ⇒ Boolean

Returns:

  • (Boolean)


34
35
36
37
38
39
40
41
42
# File 'lib/analysis/sections.rb', line 34

# Tells whether a region could be a header: it must fit inside the
# column's width and be at least as wide as it is tall. Taller-than-wide
# regions may be table columns, which should be ignored when
# determining page flow.
#
# @param pdf [Object] not consulted here; kept for a signature parallel
#   to candidate?
# @param region [Hash] region with :width and :height
# @param column [Hash] containing column with a :width
# @return [Boolean]
def self.possible_header?(pdf, region, column)
  # Anything wider than its column is ruled out straight away.
  return false unless region[:width] <= column[:width]

  region[:width] >= region[:height]
end

.reference_cluster(clusters) ⇒ Object



44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/analysis/sections.rb', line 44

# Picks the cluster whose centre :name_ratio lies closest to 0.1; such
# clusters are taken to hold the reference sections.
#
# Only clusters strictly less than 1 away from the target are eligible,
# and the earliest cluster wins a tie.
#
# @param clusters [Array<Hash>] clusters with a :centre hash that has a
#   :name_ratio
# @return [Hash, nil] the closest cluster, or nil when none is eligible
def self.reference_cluster(clusters)
  target = 0.1
  best = nil
  best_gap = 1

  clusters.each do |cluster|
    gap = (cluster[:centre][:name_ratio] - target).abs
    # Strict comparison keeps the first of several equally close clusters.
    next unless gap < best_gap

    best, best_gap = cluster, gap
  end

  best
end