Class: BANormalizedSearch::Search

Inherits:
Object
  • Object
show all
Defined in:
lib/ba-normalizedsearch.rb

Instance Method Summary collapse

Constructor Details

#initialize(wordlist = []) ⇒ Search

Returns a new instance of Search.



9
10
11
12
13
14
15
16
17
18
19
# File 'lib/ba-normalizedsearch.rb', line 9

# Sets up a searcher over +wordlist+ and the constants used by #normalize.
#
# @param wordlist [Array<String>] the words that #search filters; stored
#   as given (not copied)
def initialize(wordlist = [])
  @wordlist = wordlist

  # Characters that are stripped before comparison.
  @vocals = [
    "\u0640", # tatweel
    "\u064b", # fathatan
    "\u064c", # dammatan
    "\u064d", # kasratan
    "\u064e", # fatha
    "\u064f", # damma
    "\u0650", # kasra
    "\u0651", # shadda
    "\u0652"  # sukun
  ]
  # Source text of a regexp character class matching any of the above.
  @re_vocals_delete = "[#{@vocals.join("")}]"

  # Word fragments that get special treatment during normalisation.
  @waw = "و"
  @abd = "عبد"
  @din = "الدين"
  @al  = "ال"
  @lah = "له"
end

Instance Method Details

#normalize(s) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/ba-normalizedsearch.rb', line 21

# Reduces +s+ to a canonical form so that spelling variants of the same
# Arabic, Persian or romanised word compare equal.
#
# Steps, in order: NFC normalisation; removal of the characters listed in
# @re_vocals_delete; unification of alif, hamza-waw and yeh variants;
# removal of a word-initial "al" (unless followed by "lah"); joining of a
# standalone "wa" / "abd" to the following word; rewriting of
# yeh-with-hamza; lower-casing; expansion of romanisation diacritics to
# digraphs.
#
# Reads @re_vocals_delete, @al, @lah, @waw and @abd.
#
# @param s [String] the text to normalise
# @return [String] a new, normalised string; +s+ itself is not modified
def normalize(s)
  s = s.unicode_normalize(:nfc)

  # --- Arabic script ---
  s = s.gsub(/#{@re_vocals_delete}/, "") # delete vowel marks
  s = s.gsub(/[إأٱآ]/, "ا") # normalise alif
  s = s.gsub(/[\u0676]/, "\u0624") # ARABIC LETTER HIGH HAMZA WAW --> ARABIC LETTER WAW WITH HAMZA ABOVE
  # alif maqsura, farsi yeh, yeh --> yeh
  # (ARABIC LETTER YEH WITH HAMZA ABOVE, \u0626, is handled by the rules further down)
  s = s.gsub(/[\u0649\u06CC\u064a]/, "\u064a")
  # delete "al" at the start of a word (it could in some cases be part of
  # the word itself --> false positives)
  s = s.gsub(/\b#{@al}(?!#{@lah})/, "")
  s = s.gsub(/\b#{@waw}\s/, "#{@waw}") # Persian "wa XYZ" --> Arabic "waXYZ"
  s = s.gsub(/\b#{@abd}\s/, "#{@abd}") # standardise names with abd, e.g. abd allah (etc.) --> abdallah
  # yeh with hamza above in the middle of a word --> yeh, e.g. عقائد --> عقايد
  s = s.gsub(/\B\u0626\B/, "ي")
  # yeh with hamza above at the end of a word --> yeh + hamza, e.g. مبادئ --> مباديء
  s = s.gsub(/\B\u0626\b/, "يء")

  # Ta' marbuta (U+0629) at the end of a word is rendered as heh (U+0647)
  # in Persian, e.g. حاشية (ar) vs. حاشيه (fa)
  # ta' marbuta --> h? (Persian/Mar'ashi)

  # Arabic hamza on alif maqsura (U+0626) is rendered as yeh + hamza in
  # Persian online data, e.g. مبادئ (ar) vs. مباديء (fa)
  # However, ئ (U+0626) does occur in the Mar'ashi catalogue (قضائ, مفاتيحئ).

  # s = s.gsub(/\s#{din}(?=\b)/ , "#{din}") # standardise names with al-din, e.g. shihab aldin --> shihabaldin

  # --- Latin and romanised ---
  s = s.downcase
  s = s.gsub(/š/, "sh")
  # The next three patterns are written as \u escapes. In the reviewed copy
  # they were empty (//), which makes gsub insert the digraph between every
  # character. The letters are inferred from the replacement digraphs and
  # the neighbouring rules -- TODO confirm against the original source.
  s = s.gsub(/\u1E2B/, "kh") # U+1E2B, h with breve below
  s = s.gsub(/ġ/, "gh")
  s = s.gsub(/\u1E0F/, "dh") # U+1E0F, d with line below
  s = s.gsub(/\u1E6F/, "th") # U+1E6F, t with line below
  s = s.gsub(/á/, "ā")

  # QUESTION FOR THOMAS: ' can be hamza OR 'ayn --> how to index under both?
  #   Better: delete all hamzas/'ayns and similar
  #   --> TE: map them onto a single letter
  # Question for Thomas: delete honorific titles used as prefix? (except for short search strings?) --> vs. substring search
  # Question for Thomas: Abdallah vs. Abd allah
  # Substring search?

  s
end

#search(searchword) ⇒ Object



60
61
62
63
64
# File 'lib/ba-normalizedsearch.rb', line 60

# Returns every entry of @wordlist whose normalised form equals the
# normalised form of +searchword+.
#
# @param searchword [String] the term to look up
# @return [Array<String>] the matching entries, unmodified and in list order
def search(searchword)
  # The query is normalised once, not once per list entry. It is done
  # lazily so that an empty word list still triggers no normalisation.
  normalized_query = nil
  @wordlist.select do |listword|
    candidate = normalize(listword)
    normalized_query ||= normalize(searchword)
    candidate == normalized_query
  end
end