Class: Chomchom::Scorer

Inherits: Object

Defined in: lib/chomchom/scorer.rb

Instance Method Summary

Instance Method Details

#score(text, summary, topics, dictionary, parse_options) ⇒ Object
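
A minimal usage sketch (not from the gem's documentation): the scorer is stateless, so instantiate it and pass the full text, the candidate summary, the topic list, and the link-grammar dictionary/parse_options objects consumed by GrammarCop::Sentence.count_linkages. The file name, topics, and grammar_cop placeholders below are assumptions.

require 'chomchom'

text    = File.read('story.txt')
summary = 'A short candidate summary of the story.'
topics  = ['banking', 'regulation']

#dictionary and parse_options must be whatever link-grammar objects
#GrammarCop::Sentence.count_linkages expects; build them per your
#grammar_cop setup (placeholders here, not a documented API)
dictionary = parse_options = nil #replace with real grammar_cop objects

score = Chomchom::Scorer.new.score(text, summary, topics, dictionary, parse_options)
puts 'coverage score: %.1f' % score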



# File 'lib/chomchom/scorer.rb', line 7

def score(text, summary, topics, dictionary, parse_options)
  #drop invalid utf-8 byte sequences; the appended space keeps Iconv//IGNORE
  #from eating the last character and is stripped again with [0..-2]
  ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
  text = ic.iconv(text + ' ')[0..-2]
  summary = ic.iconv(summary + ' ')[0..-2]
  
  #step 1: prep the texts for analysis
  stemmer = Lingua::Stemmer.new(:language => 'en')
  
  stem_topics = topics.map { |t| stemmer.stem(t) }
  
  text_sentences = text.downcase.split_sentences
  tss = text_sentences.map { |ts|
    #stemmer.stem(ts) if ts.match(/\p{Word}+/)
    words = ts.downcase.split(/[^\p{Word}]/).map { |w| stemmer.stem(w) if w and w.size > 1 and !w.is_common? }.compact
    words if ts.match(/\p{Word}+/) and words.size > 0
  }.compact
  
  #rudimentary sentence scoring (number of unique non-common words)
  #another approach is to manually go through each sentence and mark the important ones
  #do they follow an identifiable pattern (contain a topic plus some other words)?
  #or are the first and last paragraphs important? the first sentence of each paragraph?
  tss_scores = tss.map { |ts| ts.uniq.size }
  
  #File.open("fulltexts/#{title}.txt", "w") do |f|
  #text_sentences.map {|ts| ts if ts.match(/\p{Word}+/)}.compact.each_with_index { |ts,i| f.puts "#{i} #{ts}" }
  #end 
  
  #step 2: coverage analysis by performing exact word match (with stemming)
  
  #evaluating the whole summary at once would likely inflate the score
  #coverages = find_coverages(summary, ts)
  
  #scoring sentence by sentence has the effect of assigning each summary sentence to a section of the text
  coverages = []
  copy_taxes = []
  improper_grammar = 0 #number of chars violating the grammar rule
  #ss = summary.downcase.split(/(?:\.+[^\p{Word}])|\n+/).each { |s|
  ss = summary.downcase.split_sentences.each { |s|
    #take a flat 30% tax for every sentence copied verbatim from the text
    copy_taxes.push(0.3) if text.downcase.index(s.gsub(/[^\p{Word}]+$/,'').gsub(/^[^\p{Word}]+/,''))
    
    s_grammar = s.gsub(/[\"\|\(\)\[\]\{\}\<\>]/,',') #replace with commas b/c link-grammar is bad with sentences containing quotes
    improper_grammar += s.size if GrammarCop::Sentence.count_linkages(s_grammar, dictionary, parse_options) == 0
    coverages.push(find_coverages(s, tss, stem_topics)) if s.match(/\p{Word}+/)
  }
  
  #step 3: synonym analysis and domain-specific fusion on words that didn't match
  #since the matches are already established, it's less likely that a word carries a double meaning within the same story
  
  #step 4: compute coverage score
  covered = coverages.flatten.uniq

  #redundancy = coverages.flatten.size - coverages.flatten.uniq.size
  #uncovered = (0...ts.size).to_a.select { |i| i if !covered.delete(i) }
  
  #this treats every uncommon word as 1 unit
  total_score = tss_scores.inject { |sum, score| sum + score }
  summary_score = covered.inject(0) { |sum, i| sum + tss_scores[i] }
  #puts "#{total_score} #{tss_scores}"
  #puts "#{summary_score} #{covered.map{|i| tss_scores[i]}}"
  
  #this treats every sentence as 1 unit (all sentences created equal)
  #puts "#{covered.size.to_f/tss.size*100}"
  
  #average the copy taxes over all summary sentences
  copy_tax = (copy_taxes.size > 0) ? (copy_taxes.inject { |sum, t| sum + t } / ss.size) : 0.0
  
  #calculate grammar tax
  #grammar_tax = grammar_tax(proper_sentences, ss.size)
  
  #take a 30% tax on the grammatically improper portion of the summary
  grammar_tax = improper_grammar.to_f/summary.size*0.30
  #puts "grammar tax=#{grammar_tax}"
  
  #punish for length with length_tax: no tax below 100 characters, then a progressively increasing rate
  tax = length_tax(summary.size) + copy_tax + grammar_tax
  #puts "total tax =#{tax}"
  summary_score.to_f/total_score*100*(1-tax)
  
  #algo weaknesses:
  #passages extracted verbatim from the text often score higher (b/c of exact word matches)
  #people could game it by listing the most frequent words in every sentence; check for proper grammar and coherence?
  #negation: take a high-scoring summary and negate its meaning? would need a check for meaning
end
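
find_coverages is defined elsewhere in the file and not shown above. From how its results are used (flattened, uniq'ed, then used to index tss_scores), it returns the indices of text sentences covered by a summary sentence. A hypothetical sketch under that assumption follows; the real matching rule and any use of stem_topics are not shown in this file, so the overlap test below is an illustration only.

#hypothetical reconstruction, not the gem's actual implementation:
#return indices of text sentences (tss is an array of stemmed,
#non-common word arrays) sharing at least one stemmed word with the
#summary sentence s; topic weighting via stem_topics is omitted here
def find_coverages(s, tss, stem_topics)
  stemmer = Lingua::Stemmer.new(:language => 'en')
  s_words = s.split(/[^\p{Word}]/).map { |w|
    stemmer.stem(w) if w and w.size > 1 and !w.is_common? #is_common? is chomchom's String extension
  }.compact
  (0...tss.size).select { |i| (tss[i] & s_words).size > 0 }
end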
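
length_tax is also not shown; the comment in the source only says there is no tax below 100 characters and that it increases progressively. One possible shape, with an assumed rate and cap:

#hypothetical: zero below 100 characters, then a progressively growing
#fraction; the 1/1000 rate and 0.5 cap are illustrative assumptions
def length_tax(size)
  return 0.0 if size <= 100
  [(size - 100) / 1000.0, 0.5].min
end

To see how the pieces combine: with total_score = 200 uncommon-word units, summary_score = 80, and a combined tax of 0.15, the method returns 80.0 / 200 * 100 * (1 - 0.15) = 34.0.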