# Absolute path of the bundled name dictionary.
# NOTE(review): the relative path is resolved against the process working
# directory, not against this file's location - confirm that callers always
# run from the project root.
DATA_FILE = File.expand_path(File.join("lib", "gender_machine", "nam_dict.txt"))
# Read-only handle on the dictionary file. The mode string makes Ruby treat
# the bytes on disk as ISO-8859-1 and transcode them to UTF-8 while reading.
# File.open is used instead of Kernel#open because the argument is always a
# plain filesystem path.
RAW_CONTENT = File.open(DATA_FILE, "r:ISO-8859-1:UTF-8")

# Entire dictionary text as a single UTF-8 string.
# The handle is closed as soon as the text has been read (even if reading
# raises) so the file descriptor is not held open for the life of the process.
# RAW_CONTENT therefore refers to a closed File afterwards.
FILE_CONTENT = begin
  RAW_CONTENT.read
ensure
  RAW_CONTENT.close
end
# A run of 80 "#" characters; used as the section delimiter when SECTIONS
# splits the header part of the dictionary file.
SECTION_DIVIDER = "#".ljust(80, "#")
# Header of the dictionary file cut into sections.
# Only the text before the first "begin of name list" marker is kept; that
# text is then split wherever a line starts with SECTION_DIVIDER.
SECTIONS = FILE_CONTENT
  .split("begin of name list").first
  .split(/^#{SECTION_DIVIDER}/)
# Individual header sections, picked out of SECTIONS by position (1, 2 and 4).
# A position that does not exist yields nil rather than raising.
DATA_LICENSE, ENCODING_INFO, SYNTAX_EXPLANATION = SECTIONS.values_at(1, 2, 4)
# Lines of header section 6 with the first 6 and the last 18 lines removed.
# NOTE(review): the offsets are tied to the exact layout of the bundled data
# file - re-check them if the file is ever updated.
COUNTRIES = SECTIONS.at(6).split("\n").slice(6..-19)
# Maps a country symbol to the character offset of "|" in that country's
# marker line.
#
# COUNTRIES is consumed three lines at a time: the first line of each group
# holds the country label, the second holds the "|" marker, and the third is
# ignored.
# The label becomes a symbol by dropping "#" and "$", trimming whitespace,
# lower-casing, replacing every character outside a-z with "_" and removing
# one trailing "_" if present.
# NOTE(review): the offset presumably identifies the country's column in the
# name rows - confirm against the data file's syntax section.
COUNTRY_MAP = COUNTRIES.each_slice(3).each_with_object({}) do |(label, marker_line), mapping|
  key = label.delete('#$').strip.downcase.tr('^a-z', '_').chomp("_").to_sym
  mapping[key] = marker_line.index('|')
end
# Reverse lookup of COUNTRY_MAP: marker offset => country symbol.
# If two countries share an offset, the one listed later wins.
COUNTRY_MAP_NUMERIC = COUNTRY_MAP.each_with_object({}) do |(country, offset), by_offset|
  by_offset[offset] = country
end
# Every line of the dictionary file (header and name rows alike), with a
# single trailing "$" removed from lines that end in one.
CONTENT = FILE_CONTENT.split("\n").map do |line|
  line.chomp("$")
end
# One [gender, name, rest, raw_line] tuple per line of CONTENT that does not
# start with "#".
#
# gender and name are the first two fields when the line is split on runs of
# whitespace. rest is whatever follows the third single whitespace character
# of the line; splitting on single characters rather than runs keeps any
# padding inside that remainder intact.
NAME_ROWS = CONTENT.each_with_object([]) do |line, rows|
  next if line.start_with?("#")

  gender, name = line.split(/\s+/, 3)
  rows << [gender, name, line.split(/\s/, 4).last, line]
end
# A Name object for every tuple in NAME_ROWS; the four tuple elements are
# passed through as the gender, name, frequency_string and raw keywords.
NAMES = NAME_ROWS.map do |(gender, name, rest, raw)|
  Name.new(gender: gender, name: name, frequency_string: rest, raw: raw)
end