Module: ChemScanner::Interpreter::PostProcess
- Included in:
- Scheme
- Defined in:
- lib/chem_scanner/interpreter/post_process/text_label.rb,
lib/chem_scanner/interpreter/post_process/reaction_info.rb,
lib/chem_scanner/interpreter/post_process/reagent_label.rb,
lib/chem_scanner/interpreter/post_process/text_as_molecule.rb,
lib/chem_scanner/interpreter/post_process/label_by_molecule.rb
Instance Method Summary collapse
- #extract_product_yield(reaction) ⇒ Object
- #extract_reaction_info(descs) ⇒ Object
- #extract_temperature(text) ⇒ Object
- #extract_time_info(text) ⇒ Object
- #extract_yield_info(text) ⇒ Object
- #merge_chemdraw_with_predefined(mgroup, reaction) ⇒ Object
- #name_to_struct(text) ⇒ Object
- #process_reaction_info(reaction) ⇒ Object
- #range_number_regex(unit_regex, can_negative) ⇒ Object
- #refine_reagents_label ⇒ Object
- #refine_text_as_molecule ⇒ Object
- #refine_text_label ⇒ Object
  A text_id can be present in both text_map and mol_group_map. Handles text used as a label, e.g. “ligand = ”, “amide = ”.
- #replace_label_by_molecule ⇒ Object
- #split_text(text) ⇒ Object
- #text_regex(text, regex) ⇒ Object
- #time_duration_range_regex ⇒ Object
Instance Method Details
#extract_product_yield(reaction) ⇒ Object
129 130 131 132 133 134 135 136 137 138 139 140 |
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 129

# Collects the yield annotations found in the text of a reaction's products.
#
# Products whose text is blank are skipped, and so are products whose text
# contains no yield expression. Keeping those empty results would produce
# separator-only strings such as ";", which a caller checking the result with
# `empty?` would mistake for a real yield.
#
# @param reaction [#products] object whose products each respond to `text`
# @return [String] the yields found, joined with ";" ("" when there are none)
def extract_product_yield(reaction)
  pyields = reaction.products.map do |mol|
    label = mol.text.strip
    label.empty? ? "" : extract_yield_info(label)
  end

  pyields.reject(&:empty?).join(";")
end
#extract_reaction_info(descs) ⇒ Object
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 106

# Scans every description for yield, temperature and time information.
#
# Each description is passed to #extract_yield_info, #extract_temperature and
# #extract_time_info (in that order); empty results are left out.
#
# @param descs [Array<String>] description texts to scan
# @return [Array<String>] three ";"-joined strings, ordered as
#   temperatures, yields, times
def extract_reaction_info(descs)
  found = { temperature: [], yield: [], time: [] }

  descs.each do |desc|
    yield_text = extract_yield_info(desc)
    found[:yield] << yield_text unless yield_text.empty?

    temperature_text = extract_temperature(desc)
    found[:temperature] << temperature_text unless temperature_text.empty?

    time_text = extract_time_info(desc)
    found[:time] << time_text unless time_text.empty?
  end

  %i[temperature yield time].map { |kind| found[kind].join(";") }
end
#extract_temperature(text) ⇒ Object
196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 |
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 196

# Extracts a temperature expression from +text+.
#
# Two things are looked for: a number (or number range) followed by
# DEGREE_REGEX, and the case-insensitive abbreviation `r.t.` (dots optional,
# presumably "room temperature"), which is reported as "20°C ~ 25°C".
#
# @param text [String] text to scan
# @return [String] the explicit temperature, the `rt` range, both joined by
#   "; ", or "" when neither is present
def extract_temperature(text)
  number_with_degree = range_number_regex(DEGREE_REGEX, true)
  explicit_regex = %r{
    #{START_REGEX}
    #{number_with_degree}
    #{ENDING_REGEX}
  }x
  explicit = text_regex(text, explicit_regex)

  abbreviation_regex = %r{
    #{START_REGEX}
    r\.?t\.?
    #{ENDING_REGEX}
  }xi
  abbreviation = text.match(abbreviation_regex)
  has_abbreviation = !abbreviation.nil? && !abbreviation[0].empty?
  return explicit unless has_abbreviation

  range = "20°C ~ 25°C"
  return range if explicit.empty?

  "#{explicit}; #{range}"
end
#extract_time_info(text) ⇒ Object
180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 180

# Extracts reaction-time expressions from +text+.
#
# Every match of #time_duration_range_regex contributes its first capture
# group. In addition, the case-insensitive words "overnight", "ovn" or "o/n"
# add the fixed range "12h ~ 20h".
#
# The alternation is written without a trailing `?`: the previous pattern
# `(overnight|ovn|o/n?)` made only the final "n" optional, so a bare "o/"
# was wrongly treated as "overnight".
#
# @param text [String] text to scan
# @return [String] the durations found, joined with ";" ("" when none)
def extract_time_info(text)
  times = []
  text.scan(time_duration_range_regex) { |groups| times << groups[0] }

  overnight_regex = %r{
    #{START_REGEX}
    (?:overnight|ovn|o/n)
    #{ENDING_REGEX}
  }xi
  times.push("12h ~ 20h") unless text_regex(text, overnight_regex).empty?

  times.join(";")
end
#extract_yield_info(text) ⇒ Object
169 170 171 172 173 174 175 176 177 178 |
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 169

# Extracts a percentage (or percentage range) from +text+.
#
# A percentage directly followed by "ee" (optionally separated by
# whitespace) is rejected by the negative lookahead.
#
# @param text [String] text to scan
# @return [String] the first matching expression, stripped, or "" if none
def extract_yield_info(text)
  percent_range = range_number_regex("%", false)
  pattern = %r{
    #{START_REGEX}
    #{percent_range}(?!\s*ee)
    #{ENDING_REGEX}
  }x

  text_regex(text, pattern)
end
#merge_chemdraw_with_predefined(mgroup, reaction) ⇒ Object
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 89

# Merges a molecule group into +reaction+, preferring predefined
# abbreviations over the group's own molecules.
#
# The group title is split with #split_text. For each piece, if
# #name_to_struct resolved it, the resulting value is appended to
# `reaction.reagent_smiles`; otherwise the group molecule at the same index
# (when there is one) is appended to `reaction.reagents`.
#
# @param mgroup [Object] group responding to `title.value` and `molecules`
# @param reaction [Object] reaction responding to `reagents` and
#   `reagent_smiles`
def merge_chemdraw_with_predefined(mgroup, reaction)
  title = mgroup.title.value
  smiles_by_name = name_to_struct(title)

  split_text(title).each_with_index do |name, position|
    smiles = smiles_by_name[name]

    unless smiles.nil?
      reaction.reagent_smiles.push(smiles)
      next
    end

    drawn = mgroup.molecules[position]
    reaction.reagents.push(drawn) unless drawn.nil?
  end
end
#name_to_struct(text) ⇒ Object
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 58

# Maps the abbreviations found in +text+ to what
# ChemScanner.get_abbreviation returns for them.
#
# First every piece produced by #split_text is looked up on its own. The
# pieces that could not be resolved are then joined with single spaces and
# searched for abbreviation keys that contain a space.
#
# @param text [String] text to analyse
# @return [Hash{String => Object}] abbreviation => lookup result
def name_to_struct(text)
  smis = {}
  unresolved = []

  split_text(text).each do |token|
    smiles = ChemScanner.get_abbreviation(token)
    if smiles.empty?
      unresolved << token
    else
      smis[token] = smiles
    end
  end
  return smis if unresolved.empty?

  leftover = unresolved.join(" ")
  spaced_keys = ChemScanner.all_abbreviations.keys.select do |key|
    key.include?(" ")
  end

  spaced_keys.each do |abb|
    next unless leftover.include?(abb)

    # Remove the matched name so it cannot be matched again by a later key.
    leftover.slice!(abb)
    smis[abb] = ChemScanner.get_abbreviation(abb)
  end

  smis
end
#process_reaction_info(reaction) ⇒ Object
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 18

# Fills in reagents, temperature, yield, time and description of +reaction+
# from the texts attached to it.
#
# For every text id of the reaction:
# * without a molecule group, the text is resolved through #name_to_struct
#   and both the resolved values and the abbreviations are recorded;
# * with a group whose title equals the text, the work is delegated to
#   #merge_chemdraw_with_predefined;
# * otherwise the title is added as an extra description, the group
#   molecules become reagents and the title is resolved for more SMILES.
#
# The yield taken from the products wins over the one found in the
# descriptions whenever it is non-empty.
#
# @param reaction [Object] the reaction to update in place
def process_reaction_info(reaction)
  descs = []

  reaction.text_ids.each do |tid|
    text = @text_map[tid].value
    descs << text

    mgroup = @mol_group_map[tid]
    if mgroup.nil?
      abbs = name_to_struct(text)
      reaction.reagent_smiles.concat(abbs.values)
      reaction.reagent_abbs.concat(abbs.keys)
      next
    end

    title = mgroup.title.value
    if title == text
      merge_chemdraw_with_predefined(mgroup, reaction)
      next
    end

    descs << title
    reaction.reagents.concat(mgroup.molecules)
    reaction.reagent_smiles.concat(name_to_struct(title).values)
  end

  temperature, ryield, time = extract_reaction_info(descs)
  pyield = extract_product_yield(reaction)

  reaction.temperature = temperature
  reaction.yield = pyield.empty? ? ryield : pyield
  reaction.time = time
  reaction.description = descs.reject { |e| e.to_s.empty? }.join("\n")
end
#range_number_regex(unit_regex, can_negative) ⇒ Object
142 143 144 145 146 147 148 |
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 142 def range_number_regex(unit_regex, can_negative) sign = can_negative ? "(-|−|–|—)?\\s*" : "" real_number = "(\\d+|\\d+\.\\d+)" "#{sign}(#{real_number}\\s*#{unit_regex}?\\s*" \ "#{RANGE_REGEX})?#{real_number}\\s*#{unit_regex}" end |
#refine_reagents_label ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/chem_scanner/interpreter/post_process/reagent_label.rb', line 8

# Moves bold arrow texts onto the nearest reagent of the same reaction.
#
# For every text attached to a reaction arrow whose bold part is not blank
# and is not already the label of one of the reaction's reagents, the
# reagent closest to the centre of the text polygon is looked up. The text
# is then detached from the reaction and its arrow, attached to that
# reagent, and the reagent's text is rebuilt with #assemble_molecule_text.
def refine_reagents_label
  @reactions.each do |r|
    pairs = []

    @arrow_map[r.arrow_id].text_arr.each do |tid|
      text = @text_map[tid]
      bold = text.bold_text
      next if bold.strip.empty?

      labelled = r.reagent_ids.find { |id| @mol_map[id].label == bold }
      next unless labelled.nil?

      # 0 doubles as the "nothing found" marker; only reagents closer than
      # the initial 9_999_999 bound can replace it.
      nearest_id = 0
      nearest_dist = 9_999_999
      r.reagent_ids.each do |rid|
        dist = @mol_map[rid].min_distance_to_point(text.polygon.center)
        next unless dist < nearest_dist

        nearest_id = rid
        nearest_dist = dist
      end

      pairs << { text: tid, reagent: nearest_id } if nearest_id.positive?
    end

    pairs.each do |pair|
      text = @text_map[pair[:text]]
      r.text_ids.delete(text.id)
      @arrow_map[r.arrow_id].text_arr.delete(text.id)

      reagent = @mol_map[pair[:reagent]]
      reagent.text_ids.push(text.id)
      assemble_molecule_text(reagent)
      # reagent.label = text.bold_text.strip
      # text.remove_bold
    end
  end
end
#refine_text_as_molecule ⇒ Object
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
# File 'lib/chem_scanner/interpreter/post_process/text_as_molecule.rb', line 10

# Turns texts that are known abbreviations into molecules of their own.
#
# A text qualifies when it is attached to a molecule, resolves through
# ChemScanner.get_abbreviation, is not positioned as "reagents" for any
# reaction arrow, and is positioned as "reactants" or "products" for at
# least one. The text is detached from its molecule, a molecule built from
# the abbreviation is stored in @mol_map under the text's key, and that key
# is appended to the matching `reactant_ids` / `product_ids` list of the
# first reaction with a detected position. Converted texts are removed from
# @text_map afterwards.
#
# The former `puts "group: ..."` debug statement was dropped so that the
# method no longer writes to standard output.
def refine_text_as_molecule
  key_to_delete = []

  @text_map.each do |k, text|
    mol = @mol_map.values.detect { |m| m.text_ids.include?(k) }
    next if mol.nil?

    smi = ChemScanner.get_abbreviation(text.value)
    next if smi.empty?

    # arrow id => position of the text relative to that arrow
    group_pos = {}
    @reactions.each do |reaction|
      rid = reaction.arrow_id
      group = detect_position(@arrow_map[rid], text.polygon)
      group_pos[rid] = group unless group.nil?
    end

    next if group_pos.any? { |_, p| p == "reagents" }
    next unless group_pos.any? { |_, p| %w[reactants products].include?(p) }

    key_to_delete.push(k)
    mol.text_ids.delete(k)
    @mol_map[k] = Molecule.new_from_smiles(k, smi)

    arrow_id, group_name = group_pos.first
    reaction = @reactions.detect { |r| r.arrow_id == arrow_id }
    # "reactants" -> reactant_ids, "products" -> product_ids
    reaction.send("#{group_name[0...-1]}_ids").push(k)
  end

  # Deleted after the loop so @text_map is not modified while iterating it.
  key_to_delete.each { |k| @text_map.delete(k) }
end
#refine_text_label ⇒ Object
A text_id can be present in both text_map and mol_group_map. Handles text used as a label, e.g. “ligand = ”, “amide = ”.
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
# File 'lib/chem_scanner/interpreter/post_process/text_label.rb', line 12

# Handles molecules whose text ends with "=" (text used as a label, e.g.
# "ligand = ", "amide = ").
#
# Such a molecule is added to the reagents of every reaction. When its label
# (the text without the trailing "=") occurs in the text of at least one
# reaction arrow, the molecule is also removed from the reactants and
# products of every reaction.
#
# NOTE(review): the molecule is pushed to `reagent_ids` of all reactions,
# including those whose arrow text does not mention the label — confirm this
# is intended.
def refine_text_label
  label_mols = @mol_map.select { |_, m| m.text.strip[-1] == "=" }

  label_mols.each do |mid, mol|
    label_text = mol.text.strip.chomp("=").strip
    existed = false

    @reactions.each do |r|
      @arrow_map[r.arrow_id].text_arr.each do |tid|
        existed = true if @text_map[tid].value.include?(label_text)
      end

      r.reagent_ids.push(mid) unless r.reagent_ids.include?(mid)
    end
    next unless existed

    @reactions.each do |r|
      %w[reactant product].each do |group|
        ids = r.send("#{group}_ids")
        ids.delete(mid) if ids.include?(mid)
      end
    end
  end
end
#replace_label_by_molecule ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/chem_scanner/interpreter/post_process/label_by_molecule.rb', line 8

# Adds molecules referenced by arrow texts to the reagents of a reaction.
#
# Each arrow text is split with ABB_DELIM. Bold pieces are compared with
# molecule labels; non-bold pieces are compared with the stripped molecule
# text, ignoring pieces shorter than three characters or matching
# /eq(uiv)?\.?/. The first molecule found for a piece is added to
# `reagent_ids` unless it is already listed.
def replace_label_by_molecule
  @reactions.each do |r|
    @arrow_map[r.arrow_id].text_arr.each do |tid|
      text = @text_map[tid]

      bold_labels = text.bold_text.strip.split(ABB_DELIM).reject(&:empty?)
      bold_labels.each do |bold|
        entry = @mol_map.detect { |_, m| m.label == bold }
        next if entry.nil?

        mid = entry[0]
        r.reagent_ids.push(mid) unless r.reagent_ids.include?(mid)
      end

      plain_pieces = text.non_bold_text.strip.split(ABB_DELIM).reject(&:empty?)
      plain_pieces.each do |plain|
        next if plain.length < 3
        next if plain =~ /eq(uiv)?\.?/

        wanted = plain.strip
        entry = @mol_map.detect { |_, m| m.text.strip == wanted }
        next if entry.nil?

        mid = entry[0]
        r.reagent_ids.push(mid) unless r.reagent_ids.include?(mid)
      end
    end
  end
end
#split_text(text) ⇒ Object
54 55 56 |
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 54

# Splits +text+ on ABB_DELIM and drops pieces of at most one character.
#
# @param text [String] text to split
# @return [Array<String>] the pieces longer than one character
def split_text(text)
  text.split(ABB_DELIM).reject { |piece| piece.length <= 1 }
end
#text_regex(text, regex) ⇒ Object
217 218 219 220 221 222 |
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 217

# Returns the first match of +regex+ in +text+ with surrounding whitespace
# removed.
#
# @param text [String] text to search
# @param regex [Regexp] pattern to look for
# @return [String] the stripped match, or "" when nothing matches
def text_regex(text, regex)
  matched = text.match(regex)
  matched ? matched[0].strip : ""
end
#time_duration_range_regex ⇒ Object
150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 150

# Builds the regex used to find durations such as "2 h", "1.5 h",
# "2-3 h" or "2 h to 3 h".
#
# A duration is a number followed by a day/hour/minute/second unit. It may
# be preceded by another number (unit optional) and an optional linker,
# which is either RANGE_REGEX or one of JOIN_WORDS.
#
# The leading "number unit? linker?" part is wrapped in an optional
# non-capturing group. Previously only the first unit was optional, so the
# leading number was mandatory: single-number durations such as "2 h" or
# "1.5 h" could not match at all, and "12 h" matched only by splitting its
# digits. Capture-group numbering is unchanged; the first group of this
# expression still holds the whole duration.
#
# @return [Regexp] extended-mode regex delimited by START_REGEX and
#   ENDING_REGEX
def time_duration_range_regex
  day = "days?|dy|d"
  hour = "hours?|hrs?|h"
  minute = "minutes?|mins?|m"
  second = "seconds?|secs?|s"

  real_number = '(\d+|\d+\.\d+)'
  time_unit = "(#{day}|#{hour}|#{minute}|#{second})"

  join_words = JOIN_WORDS.join("|")
  linker_regex = "(#{RANGE_REGEX}|(#{join_words}))"

  %r{
    #{START_REGEX}
    (
      (?:#{real_number}\s*#{time_unit}?\s*(#{linker_regex}\s*)?)?
      (#{real_number}\s*#{time_unit})
    )
    #{ENDING_REGEX}
  }x
end