Class: Chomchom::Scorer
- Inherits: Object
- Class hierarchy: Object → Chomchom::Scorer
- Defined in: lib/chomchom/scorer.rb
Instance Method Summary
Instance Method Details
#score(text, summary, topics, dictionary, parse_options) ⇒ Object
Source (lib/chomchom/scorer.rb, lines 7–91):
# File 'lib/chomchom/scorer.rb', line 7

# Scores a candidate summary against the full text it summarizes.
# The score is the percentage of the text's "information units"
# (unique non-common stemmed words per sentence) covered by the summary,
# reduced by taxes for verbatim copying, bad grammar, and excessive length.
#
# @param text [String] the full source text
# @param summary [String] the candidate summary to score
# @param topics [Array<String>] topic words, stemmed and used for coverage matching
# @param dictionary the link-grammar dictionary passed through to GrammarCop
# @param parse_options the link-grammar parse options passed through to GrammarCop
# @return [Float] coverage score in 0..100, after taxes
def score(text, summary, topics, dictionary, parse_options)
  # Solve the UTF-8 invalid-string error: round-trip through Iconv with
  # //IGNORE to drop malformed byte sequences. The trailing space + [0..-2]
  # works around Iconv dropping a dangling partial character at end-of-string.
  ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
  text = ic.iconv(text + ' ')[0..-2]
  summary = ic.iconv(summary + ' ')[0..-2]

  # Step 1: prep the texts for analysis.
  stemmer = Lingua::Stemmer.new(:language => 'en')
  stem_topics = topics.map { |t| stemmer.stem(t) }
  text_sentences = text.downcase.split_sentences
  # Each text sentence becomes its list of stemmed, non-common words
  # (length > 1); sentences with no such words are dropped.
  tss = text_sentences.map { |ts|
    words = ts.downcase.split(/[^\p{Word}]/).map { |w| stemmer.stem(w) if w and w.size > 1 and !w.is_common? }.compact
    words if ts.match(/\p{Word}+/) and words.size > 0
  }.compact

  # Rudimentary sentence scoring: each sentence is worth its count of
  # unique non-common words.
  # (Another scoring approach is to manually mark important sentences —
  # do they have an identifiable pattern? first/last paragraph? first
  # sentence in a paragraph?)
  tss_scores = tss.map { |ts| ts.uniq.size }

  # Step 2: coverage analysis via exact (stemmed) word match, sentence by
  # sentence, which has the effect of designating each summary sentence to
  # a section of the text.
  coverages = []
  copy_taxes = []
  improper_grammar = 0 # number of chars in sentences violating the grammar rule
  ss = summary.downcase.split_sentences.each { |s|
    # Flat 30% tax for every sentence copied verbatim from the text.
    copy_taxes.push(0.3) if text.downcase.index(s.gsub(/[^\p{Word}]+$/,'').gsub(/^[^\p{Word}]+/,''))
    # Replace quotes/brackets by commas b/c link-grammar is bad with
    # sentences containing quotes.
    s_grammar = s.gsub(/[\"\|\(\)\[\]\{\}\<\>]/,',')
    improper_grammar += s.size if GrammarCop::Sentence.count_linkages(s_grammar, dictionary, parse_options) == 0
    coverages.push(find_coverages(s, tss, stem_topics)) if s.match(/\p{Word}+/)
  }

  # Step 3 (future work): synonym analysis / domain-specific fusion on
  # words that didn't match. Since matched words are already established,
  # a word is less likely to carry double meanings in the same story.

  # Step 4: compute coverage score. Every uncommon word counts as 1 unit.
  covered = coverages.flatten.uniq
  # inject(0) so empty inputs yield 0 instead of nil (bare inject on an
  # empty array returns nil and would crash the arithmetic below).
  total_score = tss_scores.inject(0) { |sum, score| sum + score }
  summary_score = covered.inject(0) { |sum, i| sum + tss_scores[i] }

  # Guard: a text with no scorable content can't be covered at all.
  return 0.0 if total_score == 0

  # Average copy tax across all summary sentences.
  copy_tax = (copy_taxes.size > 0) ? (copy_taxes.inject { |sum, t| sum + t } / ss.size) : 0.0

  # Grammar tax: take 30% of the improper portion of the summary.
  # BUG FIX: original used Integer/Integer division
  # (improper_grammar/summary.size), which truncates to 0 whenever the
  # improper portion is smaller than the whole summary — the tax was
  # silently never applied. Promote to Float before dividing.
  grammar_tax = improper_grammar.to_f / summary.size * 0.30

  # Punish for length: no tax below 100 chars, progressively increasing after.
  tax = length_tax(summary.size) + copy_tax + grammar_tax

  summary_score.to_f / total_score * 100 * (1 - tax)

  # Known algo weaknesses:
  # - extracted passages from the text often score higher (exact word matches)
  # - listing the most-occurring words in every sentence games the score;
  #   check for proper grammar and coherence?
  # - negation: a high-scoring summary could say the opposite of the text;
  #   would require meaning-level checks.
end