Module: ChemScanner::Interpreter::BoldGroup
- Included in:
- MoleculeTextGroup, ReactionTextGroup
- Defined in:
- lib/chem_scanner/interpreter/text_group/bold_groups.rb
Instance Method Summary
- #extract_alphabet_number(text) ⇒ Object
- #extract_range_number(text) ⇒ Object
- #group_combinations(rgroup) ⇒ Object
- #line_bold_groups(line, target_groups) ⇒ Object
- #normalize_bold(bold) ⇒ Object
- #normalize_bold_groups(bolds, groups) ⇒ Object
- #text_bold_groups(tid) ⇒ Object
Instance Method Details
#extract_alphabet_number(text) ⇒ Object
128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 128
#
# Expands a "number + letters" label list into full labels,
# e.g. "3a,b" -> ["3a", "3b"].
#
# The first run of digits in the first comma-separated element is the anchor.
# It is removed from that element and prefixed to every (stripped) element.
#
# @param text [String] label text such as "3a,b"
# @return [Array<String>] expanded labels; [] when the text contains no
#   "digits followed by a lowercase letter" pattern, or when the first
#   comma-separated element has no digits to use as the anchor
def extract_alphabet_number(text)
  return [] unless /(\d+)( *[a-z],*)+/.match?(text)

  els = text.split(",")
  anchor = els.first.strip[/\d+/]
  # The pattern may match later in the text (e.g. "a,3b") while the first
  # element has no number; without an anchor nothing can be expanded.
  return [] if anchor.nil?

  els[0] = els[0].gsub(anchor, "")
  els.map { |char| anchor + char.strip }
end
#extract_range_number(text) ⇒ Object
116 117 118 119 120 121 122 123 124 125 126 |
# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 116
#
# Expands the first "begin-end" numeric range found in the text,
# e.g. "3-6" -> ["3", "4", "5", "6"].
#
# @param text [String] label text such as "3-6"
# @return [Array<String>] every number in the range as a string; [] when the
#   text has no "digits-digits" pattern or the range is not strictly ascending
def extract_range_number(text)
  bounds = text.scan(/(\d+)-(\d+)/).first
  return [] if bounds.nil?

  bnum, enum = bounds
  # The captures are Strings: compare their numeric values, otherwise
  # "5" >= "10" is true lexicographically and "5-10" is wrongly rejected.
  return [] if bnum.to_i >= enum.to_i

  # A Range of all-digit Strings iterates numerically.
  (bnum..enum).to_a
end
#group_combinations(rgroup) ⇒ Object
143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 143
#
# Builds every combination that picks one substituent per R-group.
#
# Combinations are ordered with the last group varying fastest. A group whose
# substituent list is empty contributes a single nil choice.
#
# @param rgroup [Hash] group name => list of substituents
# @return [Array<Hash>] one hash (group name => chosen substituent) per
#   combination; [] when rgroup is empty
def group_combinations(rgroup)
  return [] if rgroup.empty?

  names = rgroup.keys
  pools = names.map do |name|
    choices = rgroup[name]
    choices.empty? ? [nil] : choices
  end

  first_pool, *other_pools = pools
  first_pool.product(*other_pools).map { |picked| names.zip(picked).to_h }
end
#line_bold_groups(line, target_groups) ⇒ Object
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 43
#
# Extracts the bold fragments and the "group = substituents" assignments from
# one line of markdown text.
#
# @param line [String] a single markdown line; bold text is wrapped in "**"
# @param target_groups [Array<String>] group names to look for; they are
#   interpolated into a regular expression as-is (not escaped)
# @return [Array] a pair [bold, groups]: bold is an Array of stripped bold
#   fragments, groups is a Hash of group name => Array of accepted substituents
def line_bold_groups(line, target_groups)
  bold_regex = /\*\*([^\*\*]*)\*\*/
  fragments = line.scan(bold_regex).map(&:first).map(&:strip)
  # Drop any fragment that is contained in a longer bold fragment.
  bold = fragments.reject do |x|
    fragments.any? { |y| (y.size > x.size) && y.include?(x) }
  end

  group_or = "(" + target_groups.join("|") + ")"
  group_regex = /#{group_or} *=/
  assign_regex = /#{group_or} *= *([^\*\*])*(?=$|\n|\.|\z|\Z|\*\*)/

  # Start offset of every "<group> =" occurrence in the line.
  positions = line.enum_for(:scan, group_regex).map { Regexp.last_match.begin(0) }

  # Each assignment's text runs from its own offset up to the next assignment.
  # A segment the assignment pattern cannot match (e.g. it contains a lone
  # "*", as in "R = Cp*") is skipped instead of raising on nil.
  text_arr = positions.each_with_index.map do |pos, idx|
    end_pos = positions.fetch(idx + 1, line.size)
    line[pos, end_pos - pos][assign_regex]&.strip
  end.compact

  groups = text_arr.reduce({}) do |acc, gtext|
    group_val = gtext.scan(/#{group_or}? ?(?==)/)
    temp = gtext.split("=", 2).last.split(",").map do |t|
      t.strip.gsub(/^-/, "")
    end

    # Keep only substituents that are a known superatom, a known abbreviation,
    # or a plain number.
    substitutes = temp.compact.uniq.select do |text|
      is_superatom = !ChemScanner.get_superatom(text).empty?
      is_abb = !ChemScanner.get_abbreviation(text).empty?
      is_n_atom = /^\d+$/.match?(text)

      is_superatom || is_abb || is_n_atom
    end

    next acc if group_val.empty? || substitutes.empty?

    # With two or more spaces before "=", the first scan match carries no
    # group capture; fall back to the group name at the start of the text.
    name = group_val.first.first || gtext[/\A#{group_or}/]
    next acc if name.nil?

    info = { name.strip => substitutes }
    acc.merge(info) { |_, cur, extra| cur.concat(extra).compact.uniq }
  end

  [bold, groups]
end
#normalize_bold(bold) ⇒ Object
102 103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 102
#
# Expands one bold label into the list of individual labels it stands for.
#
# A numeric range is tried first ("1-3" style), then a number with letter
# suffixes ("1a,b" style); a label matching neither is returned unchanged as a
# one-element list.
#
# @param bold [String] a bold label
# @return [Array<String>] the expanded labels, or [bold]
def normalize_bold(bold)
  numbered = extract_range_number(bold)
  return numbered.dup unless numbered.empty?

  lettered = extract_alphabet_number(bold)
  lettered.empty? ? [bold] : lettered.dup
end
#normalize_bold_groups(bolds, groups) ⇒ Object
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 83
#
# Pairs every expanded bold label with the group values found at the same
# index as the bold entry it came from.
#
# @param bolds [Array<String>] bold entries; each may hold several
#   comma-separated labels
# @param groups [Hash] group name => list of values, indexed like bolds
# @return [Array<Hash>] entries of the form { bold: label, group: hash }; all
#   labels expanded from one bold entry share the same group hash object
def normalize_bold_groups(bolds, groups)
  bolds.each_with_index.flat_map do |bold, idx|
    # Collect the idx-th value of every group, skipping groups without one.
    shared = groups.each_with_object({}) do |(name, values), picked|
      value = values[idx]
      picked[name] = value unless value.nil?
    end

    labels = bold.split(",").flat_map { |part| normalize_bold(part.strip) }
    labels.map { |label| { bold: label, group: shared } }
  end
end
#text_bold_groups(tid) ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 8
#
# Collects the bold labels and the group assignments from the markdown of one
# text object.
#
# Reads @alias_info (for the group names to look for) and @text_map (for the
# text, via its #markdown method).
#
# @param tid [Object] key of the text in @text_map
# @return [Array] a pair [bold_arr, group_info]: bold_arr is an Array of bold
#   labels (purely lowercase-letter labels are dropped), group_info is a Hash
#   of group name => Array of substituents accumulated over all lines
def text_bold_groups(tid)
  all_groups = @alias_info.values.reduce([]) do |arr, infos|
    groups = infos.map { |info| info[:group] }
    arr.concat(groups).uniq
  end

  # When a closing "**" sits right before a newline, splitting on "\n" would
  # leave it on the previous line and the bold part of the next line would be
  # missed, so the marker is moved to the start of the next line.
  text = @text_map[tid].markdown.gsub(/ *\*\* *\n/, "\n**")
  lines = text.split("\n")

  bold_arr = []
  group_info = {}
  lines.each do |line|
    bold_info, groups = line_bold_groups(line, all_groups)
    next if bold_info.empty? && groups.empty?

    list_bold = bold_info.reduce([]) do |arr, bold|
      # Remove ":" from "1:", or "2a,b:"
      bold.gsub!(":", "")
      norm = normalize_bold(bold)
      expanded = norm.empty? ? bold : norm
      arr.push(expanded)
    end

    # Labels made only of lowercase letters are not kept as bold labels.
    bold_list = list_bold.flatten.reject { |b| /^[a-z]+$/.match?(b) }
    bold_arr.concat(bold_list)
    group_info.merge!(groups) { |_, old, new| old.concat(new) }
  end

  [bold_arr, group_info]
end