Class: TextCleaner
- Inherits:
-
Object
- Object
- TextCleaner
- Defined in:
- lib/text_cleaner.rb
Constant Summary collapse
# Pattern used by .normalize_whitespaces to find Unicode space/separator
# characters and replace each with a plain ASCII space. It matches:
#   U+00A0, U+1680, U+180E, the range U+2000..U+200A, U+2028, U+2029,
#   U+202F, U+205F and U+3000.
# U+200B (zero width space) is outside the range; .clean strips it separately.
# NOTE(review): Regexp::IGNORECASE looks like a no-op here since none of these
# characters appear to be cased — confirm before removing.
- @@whitespaces_regexp =
Regexp.new( "(\u00A0|\u1680|\u180E|[\u2000-\u200A]|\u2028|\u2029|\u202F|\u205F|\u3000)", Regexp::IGNORECASE, ).freeze
Class Method Summary collapse
- .capitalize(text, opts) ⇒ Object
- .clean(text, opts = {}) ⇒ Object
- .clean_title(title) ⇒ Object
- .downcase(text, opts) ⇒ Object
- .normalize_whitespaces(text) ⇒ Object
- .title_options ⇒ Object
- .upcase(text, opts) ⇒ Object
Class Method Details
.capitalize(text, opts) ⇒ Object
93 94 95 |
# File 'lib/text_cleaner.rb', line 93
#
# Capitalizes +text+, forwarding the optional case-mapping option.
#
# @param text [#capitalize] the value to capitalize (.clean passes the result
#   of +mb_chars+ on a String)
# @param opts [Hash] options; only +:case_option+ is read. When it is truthy
#   it is passed straight to +text.capitalize+ (.title_options supplies
#   +:turkic+ or +nil+).
# @return [Object] whatever +text.capitalize+ returns
def self.capitalize(text, opts)
  opts[:case_option] ? text.capitalize(opts[:case_option]) : text.capitalize
end
.clean(text, opts = {}) ⇒ Object
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/text_cleaner.rb', line 32
#
# Returns a cleaned-up copy of +text+. The argument itself is never mutated:
# all work happens on a +dup+. Invalid byte sequences are always removed and
# whitespace is always normalized; every other step runs only when its
# option is truthy.
#
# @param text [String] the text to clean
# @param opts [Hash] optional switches:
#   +:deduplicate_exclamation_marks+, +:deduplicate_question_marks+,
#   +:replace_all_upper_case+, +:capitalize_first_letter+,
#   +:remove_all_periods_from_the_end+, +:remove_extraneous_space+,
#   +:fixes_interior_spaces+, +:strip_whitespaces+,
#   +:strip_zero_width_spaces+, and +:case_option+ (forwarded to the
#   .downcase / .upcase / .capitalize helpers)
# @return [String] the cleaned text
def self.clean(text, opts = {})
  text = text.dup

  # Remove invalid byte sequences
  text.scrub!("")

  # Replace !!!!! with a single !
  text.gsub!(/!+/, "!") if opts[:deduplicate_exclamation_marks]

  # Replace ????? with a single ?
  text.gsub!(/\?+/, "?") if opts[:deduplicate_question_marks]

  # Replace all-caps text with regular case letters
  if opts[:replace_all_upper_case] && (text == upcase(text.mb_chars, opts))
    text = downcase(text.mb_chars, opts).to_s
  end

  # Capitalize first letter, but only when entire first word is lowercase.
  # Splitting on " " also drops leading whitespace and the whitespace run
  # after the first word; a single space is re-inserted when rebuilding.
  first, rest = text.split(" ", 2)
  if first && opts[:capitalize_first_letter] && first == downcase(first.mb_chars, opts)
    # Unary + guarantees an unfrozen string for the bang methods below.
    text = +"#{capitalize(first.mb_chars, opts)}#{rest ? " " + rest : ""}"
  end

  # Remove unnecessary periods at the end
  text.sub!(/([^.])\.+(\s*)\z/, '\1\2') if opts[:remove_all_periods_from_the_end]

  # Remove extraneous space before the end punctuation
  text.sub!(/\s+([!?]\s*)\z/, '\1') if opts[:remove_extraneous_space]

  # Fixes interior spaces
  text.gsub!(/ +/, " ") if opts[:fixes_interior_spaces]

  # Normalize whitespaces
  text = normalize_whitespaces(text)

  # Strip whitespaces
  text.strip! if opts[:strip_whitespaces]

  # Strip zero width spaces
  text.gsub!(/\u200b/, "") if opts[:strip_zero_width_spaces]

  text
end
.clean_title(title) ⇒ Object
28 29 30 |
# File 'lib/text_cleaner.rb', line 28
#
# Cleans a title by running .clean with the option set built by
# .title_options.
#
# @param title [String] the title to clean
# @return [Object] the result of +TextCleaner.clean+
def self.clean_title(title)
  TextCleaner.clean(title, TextCleaner.title_options)
end
.downcase(text, opts) ⇒ Object
85 86 87 |
# File 'lib/text_cleaner.rb', line 85
#
# Lowercases +text+, forwarding the optional case-mapping option.
#
# @param text [#downcase] the value to lowercase (.clean passes the result
#   of +mb_chars+ on a String)
# @param opts [Hash] options; only +:case_option+ is read. When it is truthy
#   it is passed straight to +text.downcase+ (.title_options supplies
#   +:turkic+ or +nil+).
# @return [Object] whatever +text.downcase+ returns
def self.downcase(text, opts)
  opts[:case_option] ? text.downcase(opts[:case_option]) : text.downcase
end
.normalize_whitespaces(text) ⇒ Object
81 82 83 |
# File 'lib/text_cleaner.rb', line 81
#
# Replaces every match of the class-level +@@whitespaces_regexp+ in +text+
# with a single ASCII space.
#
# @param text [String, nil] the text to normalize
# @return [String, nil] a new string with the replacements applied, or +nil+
#   when +text+ is +nil+ (the safe-navigation call short-circuits)
def self.normalize_whitespaces(text)
  text&.gsub(@@whitespaces_regexp, " ")
end
.title_options ⇒ Object
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/text_cleaner.rb', line 11
#
# Builds the option hash that .clean_title passes to .clean, derived from the
# current site settings.
#
# cf. http://meta.discourse.org/t/should-we-have-auto-replace-rules-in-titles/5687
#
# @return [Hash] options keyed by the switch names .clean reads
def self.title_options
  # Read once so every prettify-dependent flag is derived from the same value.
  prettify = SiteSetting.title_prettify

  {
    deduplicate_exclamation_marks: prettify,
    deduplicate_question_marks: prettify,
    replace_all_upper_case: prettify && !SiteSetting.allow_uppercase_posts,
    capitalize_first_letter: prettify,
    remove_all_periods_from_the_end: prettify,
    remove_extraneous_space: prettify && SiteSetting.title_remove_extraneous_space,
    fixes_interior_spaces: true,
    strip_whitespaces: true,
    strip_zero_width_spaces: true,
    # Only the "tr_TR" locale selects the :turkic case mapping.
    case_option: SiteSetting.default_locale == "tr_TR" ? :turkic : nil,
  }
end
.upcase(text, opts) ⇒ Object
89 90 91 |
# File 'lib/text_cleaner.rb', line 89
#
# Uppercases +text+, forwarding the optional case-mapping option.
#
# @param text [#upcase] the value to uppercase (.clean passes the result of
#   +mb_chars+ on a String)
# @param opts [Hash] options; only +:case_option+ is read. When it is truthy
#   it is passed straight to +text.upcase+ (.title_options supplies +:turkic+
#   or +nil+).
# @return [Object] whatever +text.upcase+ returns
def self.upcase(text, opts)
  opts[:case_option] ? text.upcase(opts[:case_option]) : text.upcase
end