Class: RubyTfIdf::TfIdf
- Inherits:
-
Object
- Object
- RubyTfIdf::TfIdf
- Defined in:
- lib/ruby-tf-idf.rb
Constant Summary collapse
- STOP_WORDS_EN =
[ 'a','cannot','into','our','thus','about','co','is','ours','to','above', 'could','it','ourselves','together','across','down','its','out','too', 'after','during','itself','over','toward','afterwards','each','last','own', 'towards','again','eg','latter','per','under','against','either','latterly', 'perhaps','until','all','else','least','rather','up','almost','elsewhere', 'less','same','upon','alone','enough','ltd','seem','us','along','etc', 'many','seemed','very','already','even','may','seeming','via','also','ever', 'me','seems','was','although','every','meanwhile','several','we','always', 'everyone','might','she','well','among','everything','more','should','were', 'amongst','everywhere','moreover','since','what','an','except','most','so', 'whatever','and','few','mostly','some','when','another','first','much', 'somehow','whence','any','for','must','someone','whenever','anyhow', 'former','my','something','where','anyone','formerly','myself','sometime', 'whereafter','anything','from','namely','sometimes','whereas','anywhere', 'further','neither','somewhere','whereby','are','had','never','still', 'wherein','around','has','nevertheless','such','whereupon','as','have', 'next','than','wherever','at','he','no','that','whether','be','hence', 'nobody','the','whither','became','her','none','their','which','because', 'here','noone','them','while','become','hereafter','nor','themselves','who', 'becomes','hereby','not','then','whoever','becoming','herein','nothing', 'thence','whole','been','hereupon','now','there','whom','before','hers', 'nowhere','thereafter','whose','beforehand','herself','of','thereby','why', 'behind','him','off','therefore','will','being','himself','often','therein', 'with','below','his','on','thereupon','within','beside','how','once', 'these','without','besides','however','one','they','would','between','i', 'only','this','yet','beyond','ie','onto','those','you','both','if','or', 'though','your','but','in','other','through','yours','by','inc','others', 
'throughout','yourself','can','indeed','otherwise','thru','yourselves' ]
- STOP_WORDS_FR =
[ '-elle','-il','à','a','afin','ai','ainsi','ais','ait','alors','après','as','assez','au','aucun', 'aucune','auprès','auquel','auquelles','auquels','auraient','aurais','aurait','aurez', 'auriez','aurions','aurons','auront','aussi','aussitôt','autre','autres','aux', 'avaient','avais','avait','avant','avec','avez','aviez','avoir','avons','ayant', 'beaucoup','c','car','ce','ceci','cela','celle','celles','celui','cependant', 'certes','ces','cet','cette','ceux','chacun','chacune','chaque','chez','cinq', 'comme','d','abord','dans','de','dehors','delà','depuis','des','dessous', 'dessus','deux','deça','dix','doit','donc','dont','du','durant','dès','déjà', 'elle','elles','en','encore','enfin','entre','er','est','est-ce','et','etc', 'eu','eurent','eut','faut','fur','hormis','hors','huit','il','ils','j','je', 'jusqu','l','la','laquelle','le','lequel','les','lesquels','leur','leurs', 'lors','lorsque','lui','là','m','mais','malgré','me','melle','mes','mm','mme', 'moi','moins','mon','mr','même','mêmes','n','neuf','ni','non-','nos','notamment', 'notre','nous','néanmoins','nôtres','on','ont','ou','où','par','parce','parfois', 'parmi','partout','pas','pendant','peu','peut','peut-être','plus','plutôt','pour', 'pourquoi','près','puisqu','puisque','qu','quand','quant','quatre','que','quel', 'quelle','quelles','quelqu','quelque','quelquefois','quelques','quels','qui', 'quoi','quot','s','sa','sans','se','sept','sera','serai','seraient','serais', 'serait','seras','serez','seriez','serions','serons','seront','ses','si','sien', 'siennes','siens','sitôt','six','soi','sommes','son','sont','sous','souvent', 'suis','sur','t','toi','ton','toujours','tous','tout','toutefois','toutes', 'troiw','tu','un','une','unes','uns','voici','voilà','vos','votre','vous','vôtres', 'y','à','ème','étaient','étais','était','étant','étiez','étions','êtes','être', 'afin','ainsi','alors','après','aucun','aucune','auprès','auquel','aussi','autant', 
'aux','avec','car','ceci','cela','celle','celles','celui','cependant','ces', 'cet','cette','ceux','chacun','chacune','chaque','chez','comme','comment','dans', 'des','donc','donné','dont','duquel','dès','déjà','elle','elles','encore','entre', 'étant','etc','été','eux','furent','grâce','hors','ici','ils','jusqu','les','leur', 'leurs','lors','lui','mais','malgré','mes','mien','mienne','miennes','miens', 'moins','moment','mon','même','mêmes','non','nos','notre','notres','nous','notre', 'oui','par','parce','parmi','plus','pour','près','puis','puisque','quand','quant', 'que','quel','quelle','quelque','quelquun','quelques','quels','qui','quoi','sans', 'sauf','selon','ses','sien','sienne','siennes','siens','soi','soit','sont','sous', 'suis','sur','tandis','tant','tes','tienne','tiennes','tiens','toi','ton','tous', 'tout','toute','toutes','trop','très','une','vos','votre','vous','étaient','était', 'étant','être' ]
Instance Attribute Summary collapse
-
#idf ⇒ Object
Returns the value of attribute idf.
-
#tf ⇒ Object
Returns the value of attribute tf.
-
#tf_idf ⇒ Object
Returns the value of attribute tf_idf.
Instance Method Summary collapse
- #compute_tf_and_idf ⇒ Object
- #compute_tf_idf(limit, exlude_stop_words) ⇒ Object
-
#initialize(docs, limit, exclude_stop_words) ⇒ TfIdf
constructor
A new instance of TfIdf.
- #split_docs(docs) ⇒ Object
Constructor Details
#initialize(docs, limit, exclude_stop_words) ⇒ TfIdf
Returns a new instance of TfIdf.
83 84 85 86 87 88 89 90 91 92 93 |
# File 'lib/ruby-tf-idf.rb', line 83
# Tokenizes the given documents and immediately computes every statistic
# exposed by this object (#tf, #idf and #tf_idf).
#
# @param docs the raw documents; handed as-is to #split_docs
# @param limit forwarded to #compute_tf_idf
# @param exclude_stop_words forwarded to #compute_tf_idf
def initialize(docs, limit, exclude_stop_words)
  # Result containers start out empty; the compute_* calls below fill them.
  @tf = []
  @idf = {}
  @tf_idf = []

  @docs = split_docs(docs)
  @docs_size = @docs.size

  compute_tf_and_idf
  compute_tf_idf(limit, exclude_stop_words)
end
Instance Attribute Details
#idf ⇒ Object
Returns the value of attribute idf.
81 82 83 |
# File 'lib/ruby-tf-idf.rb', line 81
# Reader for @idf, the Hash that #initialize creates empty and
# #compute_tf_and_idf fills with one log10-based value per word.
#
# @return [Object] the current value of @idf
def idf
  @idf
end
#tf ⇒ Object
Returns the value of attribute tf.
81 82 83 |
# File 'lib/ruby-tf-idf.rb', line 81
# Reader for @tf, the Array that #initialize creates empty and
# #compute_tf_and_idf extends with one word => count Hash per document.
#
# @return [Object] the current value of @tf
def tf
  @tf
end
#tf_idf ⇒ Object
Returns the value of attribute tf_idf.
81 82 83 |
# File 'lib/ruby-tf-idf.rb', line 81
# Reader for @tf_idf, the Array that #initialize creates empty and
# #compute_tf_idf extends with one word => score Hash per document.
#
# @return [Object] the current value of @tf_idf
def tf_idf
  @tf_idf
end
Instance Method Details
#compute_tf_and_idf ⇒ Object
108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
# File 'lib/ruby-tf-idf.rb', line 108
# Computes per-document term frequencies and the corpus-wide inverse
# document frequency of every word.
#
# Reads @docs (array of word arrays) and @docs_size. Appends one
# word => occurrence-count Hash per document to @tf, and stores
# log10(@docs_size / number_of_documents_containing_word) in @idf[word].
#
# The document frequency is counted directly. The previous version
# recovered it from the stored logarithm (10**idf) on every update, which
# accumulated floating-point error, so a word present in every document
# was not guaranteed to end up at exactly 0.0.
def compute_tf_and_idf
  doc_freq = Hash.new(0)

  @docs.each do |words|
    counts = words.each_with_object(Hash.new(0)) { |word, acc| acc[word] += 1 }
    @tf.push(counts)
    # Each distinct word counts once per document, however often it repeats.
    counts.each_key { |word| doc_freq[word] += 1 }
  end

  doc_freq.each do |word, containing_docs|
    @idf[word] = Math.log10(@docs_size.to_f / containing_docs)
  end
end

# Combines @tf and @idf into per-document TF-IDF scores.
#
# This used to be defined *inside* #compute_tf_and_idf (misplaced `end`),
# so it only existed after that method had run and was redefined on every
# call. It is now an ordinary sibling method.
#
# For every Hash in @tf, a Hash of the +limit+ highest-scoring words
# (score = @idf[word] * count, highest first) is appended to @tf_idf.
#
# @param limit [Integer] maximum number of words kept per document; zero
#   or a negative value keeps none (the old `[0..limit-1]` slice returned
#   *every* word for 0)
# @param exlude_stop_words when exactly +true+, words listed in
#   STOP_WORDS_FR or STOP_WORDS_EN are dropped before ranking
def compute_tf_idf(limit, exlude_stop_words)
  keep = [limit, 0].max

  @tf.each do |term_counts|
    scores = {}
    term_counts.each { |term, count| scores[term] = @idf[term] * count }

    if exlude_stop_words == true
      scores.reject! do |term, _score|
        STOP_WORDS_FR.include?(term) || STOP_WORDS_EN.include?(term)
      end
    end

    ranked = scores.sort_by { |_term, score| -score }
    @tf_idf.push(Hash[ranked.first(keep)])
  end
end
#compute_tf_idf(limit, exlude_stop_words) ⇒ Object
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
# File 'lib/ruby-tf-idf.rb', line 127
# Combines @tf and @idf into per-document TF-IDF scores.
#
# For every word => count Hash in @tf, a Hash holding the +limit+
# highest-scoring words (score = @idf[word] * count, highest first) is
# appended to @tf_idf.
#
# @param limit [Integer] maximum number of words kept per document; zero
#   or a negative value keeps none (the old `[0..limit-1]` slice returned
#   *every* word for 0 and dropped words from the tail for negatives)
# @param exlude_stop_words when exactly +true+, words listed in
#   STOP_WORDS_FR or STOP_WORDS_EN are dropped before ranking
def compute_tf_idf(limit, exlude_stop_words)
  keep = [limit, 0].max

  @tf.each do |term_counts|
    scores = {}
    term_counts.each { |term, count| scores[term] = @idf[term] * count }

    # Single pass over the scores instead of one reject! per language.
    if exlude_stop_words == true
      scores.reject! do |term, _score|
        STOP_WORDS_FR.include?(term) || STOP_WORDS_EN.include?(term)
      end
    end

    ranked = scores.sort_by { |_term, score| -score }
    @tf_idf.push(Hash[ranked.first(keep)])
  end
end
#split_docs(docs) ⇒ Object
95 96 97 98 99 100 101 102 103 104 105 |
# File 'lib/ruby-tf-idf.rb', line 95
# Turns raw documents into arrays of lowercase word tokens.
#
# Each String is lowercased, stripped of commas, periods and apostrophes,
# and split on runs of whitespace. Entries that are not Strings are skipped.
#
# Fixes relative to the previous version:
# * `downcase!` returns nil when the text is already lowercase, so such
#   documents raised inside the bare `rescue` and were silently dropped.
# * `downcase!` modified the caller's strings in place (and frozen strings
#   were dropped for the same silent-rescue reason).
# * Leading whitespace produced a spurious empty-string token.
#
# @param docs [Enumerable] the raw documents
# @return [Array<Array<String>>] one token array per String document
def split_docs(docs)
  docs.each_with_object([]) do |doc, tokenized|
    # Explicit skip replaces the old catch-all rescue for non-text entries.
    next unless doc.is_a?(String)

    words = doc.downcase.delete(",.'").split(/\s+/)
    tokenized << words.reject(&:empty?)
  end
end