Module: PdfExtract::Language

Defined in:
lib/language.rb

Class Method Summary collapse

Class Method Details

.cap_ratio(s) ⇒ Object

TODO: Ignore capitals that occur in the middle of words.



45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# File 'lib/language.rb', line 45

# Ratio of "unexpected" capital letters to whitespace-separated words in +s+.
#
# Every ASCII capital (A-Z) is counted, except one that is the first
# non-whitespace character of the string or the first non-whitespace
# character after a ".". Because every other capital counts, including
# consecutive ones inside a single word, the result can exceed 1.0.
#
# TODO: Ignore caps in middle of words.
#
# @param s [String] text to inspect
# @return [Float] counted capitals divided by the number of words; 0.0 when
#   +s+ contains no words (avoids the NaN produced by 0 / 0.0)
def self.cap_ratio(s)
  word_total = s.split.length
  return 0.0 if word_total.zero?

  # True while no non-whitespace character has been seen since the start
  # of the string or since the most recent ".".
  sentence_end = true
  cap_count = 0

  s.each_char do |c|
    if c =~ /\./
      sentence_end = true
    elsif c =~ /[A-Z]/
      cap_count += 1 unless sentence_end
      sentence_end = false
    elsif c =~ /[^\s]/
      sentence_end = false
    end
  end

  cap_count / word_total.to_f
end

.letter_ratio(s) ⇒ Object



40
41
42
# File 'lib/language.rb', line 40

# Fraction of the characters in +s+ that belong to a fixed set: uppercase
# A-Z, digits 0-9, and the punctuation characters - [ ] , . " ' ( ).
#
# NOTE(review): despite the method name, lowercase letters are not part of
# the counted set - confirm this is intended before changing it.
#
# @param s [String] text to inspect
# @return [Float] counted characters divided by the total length; 0.0 for
#   an empty string (avoids the NaN produced by 0 / 0.0)
def self.letter_ratio(s)
  return 0.0 if s.empty?

  s.count("A-Z0-9\-[],.\"'()") / s.length.to_f
end

.name_ratio(content) ⇒ Object



73
74
75
# File 'lib/language.rb', line 73

# Returns the :name_frequency entry of the result that
# PdfExtract::Names.detect_names produces for +content+.
#
# @param content [Object] passed through unchanged to detect_names
#   (presumably a String of text - confirm against callers)
# @return [Object] whatever detect_names stores under :name_frequency
def self.name_ratio(content)
  detection = PdfExtract::Names.detect_names(content)
  detection[:name_frequency]
end

.transliterate(s) ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/language.rb', line 5

# Returns a copy of +s+ with typographic ligatures expanded, a handful of
# punctuation characters replaced by plain ASCII equivalents, and every run
# of whitespace collapsed to a single space.
#
# Characters without an entry in the replacement table are copied through
# unchanged.
#
# @param s [String] text to normalise
# @return [String] the normalised text
def self.transliterate(s)
  replacements = {
    # Ligatures, expanded to their component letters.
    "\ufb00" => "ff",
    "\ufb01" => "fi",
    "\ufb02" => "fl",
    "\ufb03" => "ffi",
    "\ufb04" => "ffl",
    "\ufb05" => "ft",
    "\ufb06" => "st",
    "\u1d6b" => "ue",

    # Curly quotes and dashes, mapped to their ASCII counterparts.
    "\u2018" => "'",
    "\u2019" => "'",
    "\u2013" => "-",
    "\u2014" => "-",
    "\u201c" => "\"",
    "\u201d" => "\"",

    # Remaining special cases: U+25AF becomes "(", U+00B4 is dropped and
    # U+00B1 becomes "-". NOTE(review): presumably these compensate for
    # text-extraction artefacts - confirm before altering.
    "\u25af" => "(",
    "\u00b4" => "",
    "\u00b1" => "-"
  }

  converted = ""
  s.each_char do |char|
    converted << replacements.fetch(char, char)
  end

  converted.gsub(/\s+/, " ")
end

.word_count(s) ⇒ Object



77
78
79
# File 'lib/language.rb', line 77

# Number of whitespace-separated words in +s+.
#
# @param s [String] text to inspect
# @return [Integer] how many pieces String#split yields for +s+
def self.word_count(s)
  words = s.split
  words.length
end

.year_ratio(s) ⇒ Object



63
64
65
66
67
68
69
70
71
# File 'lib/language.rb', line 63

# Fraction of the whitespace-separated words in +s+ that look like they
# contain a year.
#
# A word qualifies when it contains exactly four consecutive digits with a
# non-digit character directly before AND directly after them, e.g.
# "(1999)" or "c1999.". NOTE(review): a bare "1999", or "1999," with nothing
# in front, does not qualify under this pattern - confirm whether that is
# intended before loosening it.
#
# @param s [String] text to inspect
# @return [Float] qualifying words divided by the total word count; 0.0
#   when +s+ contains no words (avoids the NaN produced by 0 / 0.0)
def self.year_ratio(s)
  words = s.split
  return 0.0 if words.empty?

  # =~ yields the match index (truthy, even when 0) or nil.
  year_total = words.count { |word| word =~ /[^\d]\d{4}[^\d]/ }

  year_total / words.length.to_f
end