Module: PdfExtract::Language
- Defined in:
- lib/language.rb
Class Method Summary collapse
- .cap_ratio(s) ⇒ Object
  TODO: Ignore caps in middle of words.
- .letter_ratio(s) ⇒ Object
- .name_ratio(content) ⇒ Object
- .transliterate(s) ⇒ Object
- .word_count(s) ⇒ Object
- .year_ratio(s) ⇒ Object
Class Method Details
.cap_ratio(s) ⇒ Object
TODO Ignore caps in middle of words
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/language.rb', line 45
#
# Ratio of "unexpected" capital letters to whitespace-separated words.
#
# A capital A-Z is counted unless it is the first non-whitespace character
# of the string or the first non-whitespace character after a "." (i.e. it
# looks like the start of a sentence).
#
# TODO: Ignore caps in middle of words (e.g. the "D" in "McDonald" is
# currently counted).
#
# @param s [String] text to inspect
# @return [Float] counted capitals divided by the number of words; 0.0 when
#   +s+ contains no words (previously this produced NaN)
def self.cap_ratio(s)
  word_total = s.split.length
  return 0.0 if word_total.zero?

  at_sentence_start = true
  cap_count = 0

  s.each_char do |c|
    case c
    when "."
      at_sentence_start = true
    when /[A-Z]/
      cap_count += 1 unless at_sentence_start
      at_sentence_start = false
    when /\S/
      # Any other visible character ends the "sentence start" state;
      # whitespace leaves it untouched.
      at_sentence_start = false
    end
  end

  cap_count / word_total.to_f
end
.letter_ratio(s) ⇒ Object
40 41 42 |
# File 'lib/language.rb', line 40
#
# Fraction of the characters in +s+ that are an uppercase letter A-Z, a
# digit 0-9, or one of the punctuation characters - [ ] , . " ' ( ).
#
# NOTE(review): lowercase letters are not part of the counted set despite
# the method name - confirm this is intentional before changing it.
#
# @param s [String] text to inspect
# @return [Float] counted characters divided by +s.length+; 0.0 for an
#   empty string (previously this produced NaN)
def self.letter_ratio(s)
  return 0.0 if s.empty?

  # In the String#count selector, the "-" after the 0-9 range is a literal
  # dash, not the start of another range.
  s.count("A-Z0-9\-[],.\"'()") / s.length.to_f
end
.name_ratio(content) ⇒ Object
73 74 75 |
# File 'lib/language.rb', line 73
#
# Runs name detection over +content+ and returns the :name_frequency entry
# of the result.
#
# @param content [Object] passed unchanged to PdfExtract::Names.detect_names
# @return [Object] whatever detect_names stores under :name_frequency
#   (presumably a ratio, given this method's name - verify against
#   PdfExtract::Names)
def self.name_ratio(content)
  detection = PdfExtract::Names.detect_names(content)
  detection[:name_frequency]
end
.transliterate(s) ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
# File 'lib/language.rb', line 5
#
# Replacement text for every character that transliterate rewrites.
# Characters not listed here are copied through unchanged.
TRANSLITERATIONS = {
  # Remove ligatures.
  "\ufb00" => "ff",
  "\ufb01" => "fi",
  "\ufb02" => "fl",
  "\ufb03" => "ffi",
  "\ufb04" => "ffl",
  "\ufb05" => "ft",
  "\ufb06" => "st",
  "\u1d6b" => "ue",
  # Normalise some punctuation.
  "\u2018" => "'",
  "\u2019" => "'",
  "\u2013" => "-",
  "\u2014" => "-",
  "\u201c" => "\"",
  "\u201d" => "\"",
  "\u25af" => "(",
  "\u00b4" => "",
  "\u00b1" => "-"
}.freeze

# Expands ligatures, normalises selected punctuation to ASCII, and collapses
# every run of whitespace into a single space.
#
# @param s [String] text to normalise
# @return [String] a new string; +s+ itself is not modified
def self.transliterate(s)
  # "".dup keeps the buffer mutable even under frozen_string_literal.
  r = "".dup
  s.each_char { |c| r << TRANSLITERATIONS.fetch(c, c) }
  # Explicit parentheses avoid Ruby's "ambiguous first argument" warning
  # for a regexp literal passed to a paren-less call.
  r.gsub(/\s+/, " ")
end
.word_count(s) ⇒ Object
77 78 79 |
# File 'lib/language.rb', line 77
#
# Number of whitespace-separated words in +s+.
#
# @param s [String] text to inspect
# @return [Integer] how many pieces String#split (default separator) yields
def self.word_count(s)
  words = s.split
  words.length
end
.year_ratio(s) ⇒ Object
63 64 65 66 67 68 69 70 71 |
# File 'lib/language.rb', line 63
#
# Fraction of whitespace-separated words in +s+ that contain a run of
# exactly four digits (a number that looks like a year).
#
# The four digits must not be adjacent to another digit, but they may sit
# at the very start or end of the word, so "2004", "2004," and "(2004)"
# all count while "12345" does not. (The previous pattern demanded an actual
# non-digit character on both sides and therefore missed bare years.)
#
# @param s [String] text to inspect
# @return [Float] year-like words divided by total words; 0.0 when +s+
#   contains no words (previously this produced NaN)
def self.year_ratio(s)
  words = s.split
  return 0.0 if words.empty?

  year_pattern = /(?<!\d)\d{4}(?!\d)/
  words.count { |word| word =~ year_pattern } / words.length.to_f
end