Class: BANormalizedSearch::Search
- Inherits:
-
Object
- Object
- BANormalizedSearch::Search
- Defined in:
- lib/ba-normalizedsearch.rb
Instance Method Summary collapse
-
#initialize(wordlist = []) ⇒ Search
constructor
A new instance of Search.
- #normalize(s) ⇒ Object
- #search(searchword) ⇒ Object
Constructor Details
#initialize(wordlist = []) ⇒ Search
Returns a new instance of Search.
9 10 11 12 13 14 15 16 17 18 19 |
# File 'lib/ba-normalizedsearch.rb', line 9
#
# Stores the word list to search in and prepares the building blocks
# (character lists and word fragments) that #normalize interpolates into
# its regular expressions.
#
# @param wordlist [Array<String>] entries later filtered by #search;
#   presumably strings, since each entry is passed to #normalize
# @return [Search] a new instance of Search
def initialize(wordlist = [])
  @wordlist = wordlist

  # tatweel fathatan dammatan kasratan fatha damma kasra shadda sukun
  @vocals = ["\u0640", "\u064b", "\u064c", "\u064d", "\u064e", "\u064f", "\u0650", "\u0651", "\u0652"]
  # Source text of a regex character class matching any one of the marks above.
  @re_vocals_delete = "[#{@vocals.join}]"

  # Word fragments used by the normalization rules.
  @waw = "و"
  @abd = "عبد"
  @din = "الدين" # only referenced by a rule that is currently commented out in #normalize
  @al = "ال"
  @lah = "له"
end
Instance Method Details
#normalize(s) ⇒ Object
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
# File 'lib/ba-normalizedsearch.rb', line 21
#
# Reduces a string to a canonical spelling so that orthographic variants
# (Arabic vs. Persian conventions, vocalized vs. unvocalized text, different
# romanization schemes) compare equal.
#
# Relies on the instance variables set in #initialize
# (@re_vocals_delete, @al, @lah, @waw, @abd).
#
# @param s [String] the text to normalize; it is not modified
# @return [String] a new, normalized string
def normalize(s)
  s = s.unicode_normalize(:nfc)

  # --- Arabic script ---
  # All pattern characters below are written as code point escapes so they
  # match the NFC-normalized input regardless of how this file is stored.
  s = s.gsub(/#{@re_vocals_delete}/, "")              # delete tatweel and vowel/shadda/sukun marks
  s = s.gsub(/[\u0625\u0623\u0671\u0622]/, "\u0627")  # normalise alif variants --> plain alif
  s = s.gsub(/\u0676/, "\u0624")                      # ARABIC LETTER HIGH HAMZA WAW --> ARABIC LETTER WAW WITH HAMZA ABOVE
  # normalise alif maqsura, farsi yeh and yeh --> yeh
  # (ARABIC LETTER YEH WITH HAMZA ABOVE, U+0626, is handled by the rules below)
  s = s.gsub(/[\u0649\u06CC\u064A]/, "\u064A")
  # delete "al" at the start of a word unless "lah" follows
  # (could in some cases be part of the word itself --> false positives)
  s = s.gsub(/\b#{@al}(?!#{@lah})/, "")
  s = s.gsub(/\b#{@waw}\s/, @waw)                     # Persian "wa XYZ" --> Arabic "waXYZ"
  s = s.gsub(/\b#{@abd}\s/, @abd)                     # standardise names with abd, e.g. abd allah --> abdallah
  s = s.gsub(/\B\u0626\B/, "\u064A")                  # yeh with hamza above inside a word --> yeh
  s = s.gsub(/\B\u0626\b/, "\u064A\u0621")            # yeh with hamza above at the end of a word --> yeh + hamza

  # Open points (translated from the original German notes):
  # - Ta' marbuta (U+0629) at the end of a word is rendered with heh (U+0647)
  #   in Persian: ta' marbuta --> h? (Persian/Mar'ashi)
  # - Arabic hamza on alif maqsura (U+0626) is rendered as yeh + hamza in
  #   Persian online data. However, U+0626 does occur in the Mar'ashi catalogue.
  # - Disabled rule (note: it references `din`, not `@din`):
  #   s = s.gsub(/\s#{din}(?=\b)/ , "#{din}") # standardise names with al-din, e.g. shihab aldin --> shihabaldin

  # --- Latin and romanised ---
  s = s.downcase
  romanized = {
    "\u0161" => "sh",     # s with caron
    "\u1E2B" => "kh",     # h with breve below
    "\u0121" => "gh",     # g with dot above
    "\u1E0F" => "dh",     # d with line below
    "\u1E6F" => "th",     # t with line below
    "\u00E1" => "\u0101"  # a with acute --> a with macron
  }
  # One pass is equivalent to replacing each character in turn, because no
  # replacement contains a character that another rule would rewrite.
  s.gsub(/[\u0161\u1E2B\u0121\u1E0F\u1E6F\u00E1]/, romanized)

  # Open questions (translated from the original German notes):
  # - Question for Thomas: ' can be hamza OR 'ayn --> how to index multiple variants?
  #   Better: delete all hamzas/'ayns and similar --> TE: map them to a single letter
  # - Question for Thomas: delete honorific titles as prefixes?
  #   (except for short search strings?) --> vs. substring search
  # - Question for Thomas: Abdallah vs. Abd allah
  # - Search by substring?
end
#search(searchword) ⇒ Object
60 61 62 63 64 |
# File 'lib/ba-normalizedsearch.rb', line 60
#
# Returns every entry of the word list whose normalized form equals the
# normalized form of +searchword+.
#
# @param searchword [String] the term to look for
# @return [Array] the matching entries of the word list, unnormalized
def search(searchword)
  needle = nil
  @wordlist.select do |listword|
    candidate = normalize(listword)
    # The search word is the same for every entry, so normalize it only once.
    # It is done lazily so an empty word list never normalizes it at all.
    needle ||= normalize(searchword)
    candidate == needle
  end
end