Module: PdfExtract::References
- Defined in:
- lib/references/references.rb
Class Method Summary
- .frequencies(lines, delimit_key) ⇒ Object
- .include_in(pdf) ⇒ Object
- .multi_margin?(lines) ⇒ Boolean
- .multi_spacing?(lines) ⇒ Boolean
- .numeric_sequence?(pdf, content) ⇒ Boolean
- .partition_by(ary, &block) ⇒ Object
- .select_delimiter(lines, delimit_key) ⇒ Object
- .split_by_delimiter(pdf, s) ⇒ Object
- .split_by_line_spacing(lines) ⇒ Object
- .split_by_margin(lines) ⇒ Object
Class Method Details
.frequencies(lines, delimit_key) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/references/references.rb', line 46

# Builds a frequency table of the floored values found under +delimit_key+.
#
# @param lines [Enumerable] items indexed with +delimit_key+; the looked-up
#   value must respond to +floor+
# @param delimit_key [Object] key read from every line (callers in this
#   module pass +:x_offset+ or +:spacing+)
# @return [Array<Hash>] one +{:value => floored, :count => occurrences}+
#   entry per distinct floored value, most frequent first
def self.frequencies lines, delimit_key
  tally = lines.each_with_object(Hash.new(0)) do |line, counts|
    counts[line[delimit_key].floor] += 1
  end

  histogram = tally.map { |value, count| {:value => value, :count => count} }
  histogram.sort_by { |entry| entry[:count] }.reverse
end
.include_in(pdf) ⇒ Object
186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 |
# File 'lib/references/references.rb', line 186

# Registers the +:references+ spatial on +pdf+. The spatial collects every
# +:sections+ object and, in its +after+ hook, splits the sections that score
# well enough as reference lists into individual references.
#
# A section is considered when its +:reference_score+ is within
# +pdf.settings[:reference_flex]+ (a fraction of the best score) of the best
# scoring section, it is not earlier than +pdf.settings[:min_lateness]+ and
# its +:year_ratio+ is non-zero. The splitting strategy is chosen in order:
# numeric sequence, then margin, then line spacing.
#
# @param pdf [Object] must respond to +spatials+ and +settings+
#   (presumably the PdfExtract parser facade; confirm against caller)
# @return [Object] whatever +pdf.spatials+ returns
def self.include_in pdf
  pdf.spatials :references, :depends_on => [:sections] do |parser|

    sections = []

    parser.objects :sections do |section|
      sections << section
    end

    parser.after do
      # Nothing was collected: there is no best score to compare against
      # (Array#max would be nil), so there are no references either.
      next [] if sections.empty?

      max_score = sections.map { |s| s[:reference_score] }.max
      min_permittable = max_score - (max_score * pdf.settings[:reference_flex])

      refs = []

      sections = sections.reject do |s|
        # A section without any years is definitely not a list of
        # references. So too a section that appears in the first
        # half of an article.
        s[:lateness] < pdf.settings[:min_lateness] || s[:year_ratio].zero?
      end

      sections.each do |section|
        next unless section[:reference_score] >= min_permittable

        # TODO Enable classification once we have a reasonable model.
        #if Score.reference?(section)

        content = Spatial.get_text_content(section)

        if numeric_sequence? pdf, content
          refs += split_by_delimiter pdf, content
        elsif multi_margin? section[:lines]
          refs += split_by_margin section[:lines]
        elsif multi_spacing? section[:lines]
          refs += split_by_line_spacing section[:lines]
        end
      end

      # TODO Ideally we wouldn't see the ref headers here.
      # Unfortunately publication details can look a lot like references.
      refs.reject do |ref|
        norm = ref[:content].downcase.strip
        norm =~ /references?/ || norm =~ /submitted for publication/ ||
          norm =~ /additional contributions/
      end
    end

  end
end
.multi_margin?(lines) ⇒ Boolean
155 156 157 |
# File 'lib/references/references.rb', line 155

# Tells whether the lines start at more than one distinct (floored)
# horizontal offset.
#
# @param lines [Array] items whose +:x_offset+ entry responds to +floor+
# @return [Boolean] true when at least two different floored offsets occur
def self.multi_margin? lines
  distinct_offsets = lines.map { |line| line[:x_offset].floor }.uniq
  distinct_offsets.size > 1
end
.multi_spacing?(lines) ⇒ Boolean
159 160 161 |
# File 'lib/references/references.rb', line 159

# Tells whether the lines use more than one distinct (floored) spacing value.
#
# @param lines [Array] items whose +:spacing+ entry responds to +floor+
# @return [Boolean] true when at least two different floored spacings occur
def self.multi_spacing? lines
  distinct_spacings = lines.map { |line| line[:spacing].floor }.uniq
  distinct_spacings.size > 1
end
.numeric_sequence?(pdf, content) ⇒ Boolean
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
# File 'lib/references/references.rb', line 163

# Decides whether +content+ looks like a numbered list: it must contain a
# long enough run of consecutive numbers and the run must begin near the
# very start of the text.
#
# Numbers that are not below +pdf.settings[:max_reference_order]+ are
# ignored so that years are not mistaken for list numbering. The first
# remaining number starts the sequence; every later number equal to the
# previous sequence number plus one extends it.
#
# The position check uses the real offset of the number that started the
# sequence. A plain substring search over the first characters would let a
# sequence starting with "1" be satisfied by the "1" inside "2010".
#
# @param pdf [Object] must respond to +settings+, returning something
#   indexable by +:max_reference_order+ and +:min_sequence_count+
# @param content [String] text to inspect
# @return [Boolean] strictly +true+ or +false+
def self.numeric_sequence? pdf, content
  max_order = pdf.settings[:max_reference_order]
  last_n = nil
  first_end = nil
  seq_count = 0

  content.scan(/\d+/) do |digits|
    n = digits.to_i

    # Avoid misinterpreting years as sequence
    next unless n < max_order

    if last_n.nil?
      last_n = n
      # Offset just past the number that opens the sequence.
      first_end = Regexp.last_match.end(0)
    elsif n == last_n.next
      last_n = n
      seq_count += 1
    end
  end

  # Without any usable number there is no sequence at all.
  return false if last_n.nil?

  # Sequence must be long enough and first number of sequence
  # must appear near the very start of content, i.e. lie entirely
  # within the first 31 characters (content[0..30]).
  seq_count >= pdf.settings[:min_sequence_count] && first_end <= 31
end
.partition_by(ary, &block) ⇒ Object
32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/references/references.rb', line 32

# Cuts +ary+ into consecutive groups, opening a new group in front of every
# item for which the block returns a truthy value. Items ahead of the first
# such item form a group of their own. Empty groups are never returned.
#
# @param ary [Enumerable] items to group
# @yield [item] called once per item, in order
# @yieldreturn [Boolean] truthy when +item+ begins a new group
# @return [Array<Array>] the groups, in original order
def self.partition_by ary, &block
  ary.slice_before { |item| yield(item) }.to_a
end
.select_delimiter(lines, delimit_key) ⇒ Object
62 63 64 |
# File 'lib/references/references.rb', line 62

# Picks the second most frequent floored value of +delimit_key+ among
# +lines+, as ranked by +frequencies+.
#
# @param lines [Enumerable] passed through to +frequencies+
# @param delimit_key [Object] passed through to +frequencies+
# @return [Object, nil] the +:value+ of the runner-up entry, or +nil+ when
#   fewer than two distinct values exist (there is then no runner-up)
def self.select_delimiter lines, delimit_key
  runner_up = frequencies(lines, delimit_key)[1]
  runner_up && runner_up[:value]
end
.split_by_delimiter(pdf, s) ⇒ Object
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# File 'lib/references/references.rb', line 80

# Splits +s+ into individual references, using an ascending run of numbers
# (such as "1.", "[2]" or " 3 ") as partition points.
#
# The method works in two passes:
#
# 1. Scan for numbers below +pdf.settings[:max_reference_order]+ that form a
#    consecutive sequence and tally the character right before and right
#    after each of them. The most common pair is taken as the delimiter
#    decoration. Only a small whitelist of decorations is accepted.
# 2. Walk through +s+ delimiter by delimiter. Text before the first
#    delimiter is dropped; each following in-sequence delimiter closes the
#    reference collected so far. Delimiter look-alikes that are not the next
#    number in sequence stay part of the reference text.
#
# @param pdf [Object] must respond to +settings+, returning something
#   indexable by +:max_reference_order+
# @param s [String] text of a reference section
# @return [Array<Hash>] +{:content => String, :order => Integer}+ entries;
#   empty when no sequence number is found or the delimiter decoration is
#   not whitelisted
def self.split_by_delimiter pdf, s
  max_order = pdf.settings[:max_reference_order]

  # Find sequential numbers and use them as partition points.

  # Determine the characters that are most likely part of numeric
  # delimiters.

  after = Hash.new(0)
  before = Hash.new(0)
  last_n = -1

  s.scan(/[^\d]?\d+[^\d]/) do |m|
    n = m[/\d+/].to_i

    next unless n < max_order
    next unless last_n == -1 || n == last_n.next

    # The leading character is optional in the pattern. When it is absent
    # the match begins with the number itself, which must be tallied as
    # "no leading delimiter" rather than as a digit.
    prefix = m[0]
    prefix = "" if prefix =~ /\d/

    before[prefix] += 1
    after[m[-1]] += 1
    last_n = n
  end

  # No usable number at all: nothing to split on.
  return [] if before.empty?

  common_prefix = before.max_by { |_, v| v }[0]
  b_s = common_prefix.empty? ? "" : "\\" + common_prefix
  a_s = "\\" + after.max_by { |_, v| v }[0]

  # TODO Turn into settings. Needs typed settings

  unless ["", "\\[", "\\ "].include?(b_s) && ["", "\\.", "\\]", "\\ "].include?(a_s)
    return []
  end

  # An empty prefix must not be followed by "?", which would be an invalid
  # pattern (repeat operator without a target).
  delimiter = Regexp.new(b_s.empty? ? "\\d+#{a_s}" : "#{b_s}?\\d+#{a_s}")

  # Split by the delimiters and record separate refs.

  last_n = -1
  current_ref = ""
  refs = []
  head, marker, rest = s.partition(delimiter)

  until marker.empty?
    n = marker[/\d+/].to_i

    if n < max_order && last_n == -1
      # First delimiter of the sequence; the text in front of it is dropped.
      last_n = n
    elsif n == last_n.next
      # Next number in sequence: the reference collected so far is complete.
      current_ref += head
      refs << {
        :content => current_ref.strip,
        :order => last_n
      }
      current_ref = ""
      last_n = n
    else
      # Looks like a delimiter but is out of sequence: keep it as content.
      current_ref += head + marker
    end

    head, marker, rest = rest.partition(delimiter)
  end

  # Whatever remains after the last delimiter is the final reference.
  refs << {
    :content => (current_ref + head).strip,
    :order => last_n
  }

  refs
end
.split_by_line_spacing(lines) ⇒ Object
73 74 75 76 77 78 |
# File 'lib/references/references.rb', line 73

# Groups +lines+ into references using line spacing: a line whose floored
# +:spacing+ equals the value chosen by +select_delimiter+ opens a new
# reference. Lines ahead of the first such line are discarded.
#
# @param lines [Array] items providing +:spacing+ (responding to +floor+)
#   and +:content+
# @return [Array<Hash>] one +{:content => String}+ per reference, the
#   contents of its lines joined with single spaces
def self.split_by_line_spacing lines
  boundary = select_delimiter(lines, :spacing)
  opens_reference = lambda { |line| line[:spacing].floor == boundary }

  body = lines.drop_while { |line| !opens_reference.call(line) }
  groups = partition_by(body) { |line| opens_reference.call(line) }

  groups.map do |group|
    {:content => group.map { |line| line[:content] }.join(" ")}
  end
end
.split_by_margin(lines) ⇒ Object
66 67 68 69 70 71 |
# File 'lib/references/references.rb', line 66

# Groups +lines+ into references using the left margin: a line whose floored
# +:x_offset+ equals the value chosen by +select_delimiter+ opens a new
# reference. Lines ahead of the first such line are discarded.
#
# @param lines [Array] items providing +:x_offset+ (responding to +floor+)
#   and +:content+
# @return [Array<Hash>] one +{:content => String}+ per reference, the
#   contents of its lines joined with single spaces
def self.split_by_margin lines
  boundary = select_delimiter(lines, :x_offset)
  opens_reference = lambda { |line| line[:x_offset].floor == boundary }

  body = lines.drop_while { |line| !opens_reference.call(line) }
  groups = partition_by(body) { |line| opens_reference.call(line) }

  groups.map do |group|
    {:content => group.map { |line| line[:content] }.join(" ")}
  end
end