Class: Matlock
- Inherits: Object
- Inheritance chain:
  - Object
  - Matlock
- Defined in:
- lib/matlock.rb,
lib/matlock/data.rb,
lib/matlock/version.rb
Defined Under Namespace
Classes: Data
Constant Summary
- VERSION = "0.1.2"
Instance Attribute Summary
-
#stopwords ⇒ Object
A list of stop words to ignore when matching names.
Instance Method Summary
-
#extract_names(content) ⇒ Object
Extracts a list of names from a string.
-
#initialize ⇒ Matlock
constructor
Creates a new Matlock object.
Constructor Details
#initialize ⇒ Matlock
Creates a new Matlock object.
15 16 17 |
# File 'lib/matlock.rb', line 15

# Creates a new Matlock object.
#
# The instance starts with an empty stop-word list, stored in @stopwords.
def initialize
  @stopwords = []
end
Instance Attribute Details
#stopwords ⇒ Object
A list of stop words to ignore when matching names.
27 28 29 |
# File 'lib/matlock.rb', line 27

# A list of stop words to ignore when matching names.
#
# Falls back to (and remembers) an empty list when @stopwords has not been
# assigned yet, so callers can always treat the result as a collection
# instead of having to guard against nil.
#
# @return [Array] the stop-word list
def stopwords
  @stopwords ||= []
end
Instance Method Details
#extract_names(content) ⇒ Object
Extracts a list of names from a string.
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# File 'lib/matlock.rb', line 46

# Extracts a list of names from a string.
#
# The text is reduced to a sequence of words and every pair of neighbouring
# words is tested as a "first name, surname" candidate. A pair is reported
# when both words are title case, neither is a stop word, and at least one of
# them is recognised by Matlock::Data. A lowercase "van"/"von" between the two
# words is treated as part of the surname.
#
# NOTE(review): String#titlecase? and Matlock::Data are not defined in the
# code shown here; they must be provided elsewhere in the project.
#
# @param content [String, nil] text to scan; nil is treated as empty text
# @return [Array<String>] unique full names, in order of first appearance
def extract_names(content)
  names = []

  # Split content into words. Tokens containing digits or underscores are
  # dropped entirely, so the words on either side of one become neighbours.
  words = content.to_s.split(/[^-_a-z0-9]+/i).select { |word| word.match?(/\A[-a-z]+\z/i) }

  # Loop over each bigram and check if the words are title cased and if at
  # least one of the words is a first or last name.
  words.each_with_index do |first_name, index|
    surname = full_surname = words[index + 1] || ''

    # A surname particle is skipped for the checks below (they look at the
    # word after it) but kept in the reported name.
    if %w[van von].include?(surname)
      surname = words[index + 2] || ''
      full_surname = "#{full_surname} #{surname}"
    end

    # Only look at two words that are titlecase and neither one is a stopword.
    next unless first_name.titlecase? && surname.titlecase?
    # Words are upper-cased before the lookup, so only upper-case stop-word
    # entries can ever match.
    next if stopwords.include?(first_name.upcase) || stopwords.include?(surname.upcase)

    # Check if either the first name or last name is a recognized common name.
    next unless Matlock::Data.first_name?(first_name) || Matlock::Data.surname?(surname)

    full_name = "#{first_name} #{full_surname}"
    names << full_name unless names.include?(full_name)
  end

  names
end