Class: Yawc
- Inherits:
-
Object
- Object
- Yawc
- Defined in:
- lib/yawc.rb
Instance Attribute Summary collapse
-
#to_h ⇒ Object
readonly
Returns the value of attribute to_h.
Instance Method Summary collapse
-
#initialize(s, level: 2) ⇒ Yawc
constructor
level: 2 strips out ignore_words and stop_words; 3 additionally strips out dictionary words.
- #words(s) ⇒ Object
Constructor Details
#initialize(s, level: 2) ⇒ Yawc
level:
2 strips out ignore_words and stop_words
3 strips out dictionary words
695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 |
# File 'lib/yawc.rb', line 695

# Builds a word-frequency table for the text +s+.
#
# level:
#   2 strips out ignore_words and stop_words
#   3 additionally strips out dictionary words (WordsDotDat.words)
#
# The table is stored in @to_h as a Hash of word => occurrence count,
# ordered from the most frequent word to the least frequent one.
#
# Raises ArgumentError when +level+ is neither 2 nor 3; without the guard
# the word list would be nil and fail later with a NoMethodError.
def initialize(s, level: 2)

  @stopwords = STOPWORDS.strip.lines.map(&:chomp)

  a = case level
  when 2
    words(s).split
  when 3
    words(s).split - WordsDotDat.words
  else
    raise ArgumentError,
          "unsupported level: #{level.inspect} (expected 2 or 3)"
  end

  # Count every word in a single pass. Each key is a fresh copy of the
  # word whose encoding is set to UTF-8.
  h = a.each_with_object(Hash.new(0)) do |word, counts|
    counts[String.new(word).force_encoding('utf-8')] += 1
  end

  # Ascending sort by count, then reversed so the highest counts come first.
  @to_h = h.sort_by(&:last).reverse.to_h

end
Instance Attribute Details
#to_h ⇒ Object (readonly)
Returns the value of attribute to_h.
689 690 691 |
# File 'lib/yawc.rb', line 689

# Returns the value of attribute to_h (the @to_h instance variable).
def to_h
  @to_h
end
Instance Method Details
#words(s) ⇒ Object
715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 |
# File 'lib/yawc.rb', line 715

# Normalises the text +s+ and returns it as a String with common words and
# punctuation removed. The result may still contain leading, trailing or
# repeated whitespace.
#
# Reads @stopwords (an Array of Strings); those entries are removed in
# addition to the built-in ignore words below.
def words(s)

  # words source: http://norvig.com/mayzner.html
  ignorewords = %w[the of and to in a is that for it as was with be by on
                   not he i this are or his from at which but have an had
                   they you were their one all we can her has there been
                   if more when will would who so no]

  # Escape every entry so that a stop word containing regex metacharacters
  # (e.g. "c++" or "(") is matched literally instead of altering the pattern.
  common = (ignorewords + @stopwords).map { |w| Regexp.escape(w) }.join('|')

  text = s.downcase

  # remove words containing an apostrophe
  text = text.gsub(/\w+'\w+/, '')

  # remove quotation marks
  text = text.gsub(/["']/, '')

  # remove non-alphabetical characters from the start of words
  text = text.gsub(/\W[^a-z|#]+(\w+)/, ' \1 ')

  # remove any single character standing alone between whitespace
  # (this covers single digits)
  text = text.gsub(/\s.\s/, ' ')

  # ignore common words
  text = text.gsub(/\b(?:#{common})\b/, '')

  # remove any other items which are not words or hashtags; \B is
  # zero-width, so it needs no repetition quantifier
  text.gsub(/\B[^\w#]\B/, '')

end