Module: ChemScanner::Interpreter::PostProcess

Included in:
Scheme
Defined in:
lib/chem_scanner/interpreter/post_process/text_label.rb,
lib/chem_scanner/interpreter/post_process/reaction_info.rb,
lib/chem_scanner/interpreter/post_process/reagent_label.rb,
lib/chem_scanner/interpreter/post_process/text_as_molecule.rb,
lib/chem_scanner/interpreter/post_process/label_by_molecule.rb

Instance Method Summary collapse

Instance Method Details

#extract_product_yield(reaction) ⇒ Object



129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 129

# Collects the yield text found in the label of every product of +reaction+.
#
# Products whose stripped text is blank are skipped, and so are products
# whose text contains no yield. Pushing those empty results used to produce
# strings such as ";" or ";85%;", which are not empty and therefore hid the
# description-based yield the caller would otherwise fall back to.
#
# @param reaction [#products] object whose products respond to +text+
# @return [String] extracted yields joined with ";" ("" when none is found)
def extract_product_yield(reaction)
  pyields = []

  reaction.products.each do |mol|
    label = mol.text.strip
    next if label.empty?

    pyield = extract_yield_info(label)
    # Same filtering as extract_reaction_info: keep real matches only.
    pyields.push(pyield) unless pyield.empty?
  end

  pyields.join(";")
end

#extract_reaction_info(descs) ⇒ Object



106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 106

# Gathers temperature, yield and time information from a list of
# description strings.
#
# Each description is passed to extract_yield_info, extract_temperature and
# extract_time_info (in that order); empty results are discarded.
#
# @param descs [Array<String>] description texts to inspect
# @return [Array(String, String, String)] temperatures, yields and times,
#   in that order, each joined with ";"
def extract_reaction_info(descs)
  collected = { temperature: [], yield: [], time: [] }

  descs.each do |desc|
    found = {
      yield: extract_yield_info(desc),
      temperature: extract_temperature(desc),
      time: extract_time_info(desc),
    }
    found.each { |kind, value| collected[kind] << value unless value.empty? }
  end

  collected.values_at(:temperature, :yield, :time).map { |values| values.join(";") }
end

#extract_temperature(text) ⇒ Object



196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 196

# Extracts temperature information from a free-text description.
#
# Two sources are combined: an explicit value or range matched with the
# pattern from range_number_regex (built with DEGREE_REGEX, sign allowed),
# and the case-insensitive shorthand "rt" / "r.t.", which is reported as
# "20°C ~ 25°C".
#
# @param text [String] description to search
# @return [String] the explicit temperature, the room-temperature range,
#   both joined with "; ", or "" when neither is present
def extract_temperature(text)
  number_pattern = range_number_regex(DEGREE_REGEX, true)
  explicit_regex = %r{
    #{START_REGEX}
    #{number_pattern}
    #{ENDING_REGEX}
  }x
  explicit = text_regex(text, explicit_regex)

  room_temp_regex = %r{
    #{START_REGEX}
    r\.?t\.?
    #{ENDING_REGEX}
  }xi
  # String#[] yields the matched text, or nil when nothing matches.
  return explicit if text[room_temp_regex].to_s.empty?

  room_temp = "20°C ~ 25°C"
  return room_temp if explicit.empty?

  "#{explicit}; #{room_temp}"
end

#extract_time_info(text) ⇒ Object



180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 180

# Extracts reaction-time information from a free-text description.
#
# Every match of time_duration_range_regex contributes its first capture
# group. In addition, the case-insensitive keywords "overnight", "ovn" and
# "o/n" are reported as "12h ~ 20h".
#
# The keyword pattern used to be written as (overnight|ovn|o/n?): the "?"
# applied only to the last "n", so a bare "o/" token was also treated as
# "overnight". The stray quantifier is removed.
#
# @param text [String] description to search
# @return [String] the collected durations joined with ";" ("" when none)
def extract_time_info(text)
  time = []
  text.scan(time_duration_range_regex) { |m| time << m[0] }

  overnight_regex = %r{
    #{START_REGEX}
    (overnight|ovn|o/n)
    #{ENDING_REGEX}
  }xi
  time.push("12h ~ 20h") unless text_regex(text, overnight_regex).empty?

  time.join(";")
end

#extract_yield_info(text) ⇒ Object



169
170
171
172
173
174
175
176
177
178
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 169

# Extracts a percentage yield (single value or range) from +text+.
#
# The percentage pattern comes from range_number_regex("%", false). A
# percentage that is followed by optional whitespace and "ee" is rejected
# by the negative lookahead.
#
# @param text [String] text to search
# @return [String] the stripped match, or "" when nothing matches
def extract_yield_info(text)
  percent_pattern = range_number_regex("%", false)

  text_regex(
    text,
    %r{
      #{START_REGEX}
      #{percent_pattern}(?!\s*ee)
      #{ENDING_REGEX}
    }x
  )
end

#merge_chemdraw_with_predefined(mgroup, reaction) ⇒ Object



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 89

# Distributes the entries of a molecule group's title between abbreviation
# SMILES and the group's own molecules.
#
# The title is split with split_text. An entry that name_to_struct resolved
# is added to +reaction.reagent_smiles+; otherwise the molecule at the same
# index in +mgroup.molecules+ (if there is one) is added to
# +reaction.reagents+.
#
# @param mgroup [#title, #molecules] group whose title responds to +value+
# @param reaction [#reagents, #reagent_smiles] reaction updated in place
# @return [Array<String>] the split title entries
def merge_chemdraw_with_predefined(mgroup, reaction)
  title = mgroup.title.value
  smiles_by_name = name_to_struct(title)

  split_text(title).each_with_index do |entry, position|
    smiles = smiles_by_name[entry]

    unless smiles.nil?
      reaction.reagent_smiles.push(smiles)
      next
    end

    # Fall back to the molecule at the same position, when it exists.
    drawn = mgroup.molecules[position]
    reaction.reagents.push(drawn) unless drawn.nil?
  end
end

#name_to_struct(text) ⇒ Object



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 58

# Maps abbreviation names found in +text+ to the values returned by
# ChemScanner.get_abbreviation.
#
# The text is split with split_text and every entry is looked up on its
# own. Entries without a result are re-joined with single spaces and
# checked against every key of ChemScanner.all_abbreviations that contains
# a space; a matched key is cut out of the joined string so the same
# characters are not matched twice.
#
# @param text [String] text to analyse
# @return [Hash{String => Object}] matched name => abbreviation value
def name_to_struct(text)
  smiles_by_name = {}

  leftovers = split_text(text).each_with_object([]) do |entry, unmatched|
    smiles = ChemScanner.get_abbreviation(entry)
    if smiles.empty?
      unmatched << entry
    else
      smiles_by_name[entry] = smiles
    end
  end
  return smiles_by_name if leftovers.empty?

  joined = leftovers.join(" ")
  ChemScanner.all_abbreviations.keys.each do |abb|
    next unless abb.include?(" ") && joined.include?(abb)

    joined.slice!(abb)
    smiles_by_name[abb] = ChemScanner.get_abbreviation(abb)
  end

  smiles_by_name
end

#process_reaction_info(reaction) ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 18

# Fills in reagents, temperature, yield, time and description of +reaction+
# from the texts referenced by +reaction.text_ids+.
#
# For every text id:
# * without an entry in @mol_group_map, abbreviations found in the text are
#   added to +reagent_smiles+ and +reagent_abbs+;
# * with a molecule group whose title equals the text, the work is handed
#   to merge_chemdraw_with_predefined;
# * otherwise the group title becomes an extra description, the group's
#   molecules are added to +reagents+ and the title's abbreviations to
#   +reagent_smiles+.
#
# The product-based yield is used when it is not empty; otherwise the yield
# found in the descriptions is used.
#
# @param reaction [Object] reaction updated in place
# @return [String] the description assigned to the reaction
def process_reaction_info(reaction)
  descs = []

  reaction.text_ids.each do |tid|
    text = @text_map[tid].value
    descs << text

    mgroup = @mol_group_map[tid]
    if mgroup.nil?
      plain_abbs = name_to_struct(text)
      reaction.reagent_smiles.concat(plain_abbs.values)
      reaction.reagent_abbs.concat(plain_abbs.keys)
      next
    end

    mtext = mgroup.title.value
    if mtext == text
      merge_chemdraw_with_predefined(mgroup, reaction)
      next
    end

    descs << mtext
    reaction.reagents.concat(mgroup.molecules)
    reaction.reagent_smiles.concat(name_to_struct(mtext).values)
  end

  temperature, reaction_yield, time = extract_reaction_info(descs)
  product_yield = extract_product_yield(reaction)

  reaction.temperature = temperature
  reaction.yield =
    if product_yield.empty?
      reaction_yield
    else
      product_yield
    end
  reaction.time = time
  reaction.description = descs.reject { |e| e.to_s.empty? }.join("\n")
end

#range_number_regex(unit_regex, can_negative) ⇒ Object



142
143
144
145
146
147
148
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 142

# Builds the source string of a pattern matching a number, or a range of
# two numbers joined by RANGE_REGEX, followed by a unit.
#
# The decimal point is now escaped correctly. It was previously written as
# "\." inside a double-quoted string, which Ruby reduces to a bare ".", so
# the decimal branch accepted any character between the digit runs
# (for example "85x5%").
#
# @param unit_regex [String] pattern source for the unit; it is mandatory
#   after the last number and followed by "?" after the first one
# @param can_negative [Boolean] whether a leading minus or dash is allowed
# @return [String] pattern source to be interpolated into a Regexp
def range_number_regex(unit_regex, can_negative)
  sign = can_negative ? "(-|−|–|—)?\\s*" : ""
  real_number = "(\\d+|\\d+\\.\\d+)"

  "#{sign}(#{real_number}\\s*#{unit_regex}?\\s*" \
  "#{RANGE_REGEX})?#{real_number}\\s*#{unit_regex}"
end

#refine_reagents_label ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/chem_scanner/interpreter/post_process/reagent_label.rb', line 8

# Attaches bold arrow texts to the nearest reagent molecule of each
# reaction.
#
# A text on the reaction's arrow is considered when its bold part is not
# blank and no reagent of the reaction already has that bold text as its
# label. The reagent with the smallest min_distance_to_point to the centre
# of the text polygon (below 9_999_999) is selected. The text id is then
# removed from the reaction and from the arrow, appended to the reagent's
# text ids, and assemble_molecule_text is called for the reagent.
#
# @return [Array] @reactions
def refine_reagents_label
  @reactions.each do |r|
    pending = []

    @arrow_map[r.arrow_id].text_arr.each do |tid|
      text = @text_map[tid]
      bold = text.bold_text
      next if bold.strip.empty?
      next unless r.reagent_ids.find { |id| @mol_map[id].label == bold }.nil?

      # 0 is the "nothing selected" marker; only positive ids are accepted.
      nearest_id = 0
      nearest_dist = 9_999_999
      r.reagent_ids.each do |rid|
        dist = @mol_map[rid].min_distance_to_point(text.polygon.center)
        next unless dist < nearest_dist

        nearest_id = rid
        nearest_dist = dist
      end

      pending.push(text: tid, reagent: nearest_id) if nearest_id.positive?
    end

    # Mutations are applied after the scan so text_arr is not modified
    # while it is being iterated.
    pending.each do |entry|
      text = @text_map[entry[:text]]
      r.text_ids.delete(text.id)
      @arrow_map[r.arrow_id].text_arr.delete(text.id)

      reagent = @mol_map[entry[:reagent]]
      reagent.text_ids.push(text.id)
      assemble_molecule_text(reagent)
      # reagent.label = text.bold_text.strip
      # text.remove_bold
    end
  end
end

#refine_text_as_molecule ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/chem_scanner/interpreter/post_process/text_as_molecule.rb', line 10

# Turns texts that are attached to a molecule but are themselves known
# abbreviations into molecules of their own.
#
# A text qualifies when some molecule lists its id in +text_ids+,
# ChemScanner.get_abbreviation returns a non-empty value for it, no reaction
# positions it as "reagents", and at least one reaction positions it as
# "reactants" or "products". It is then detached from its molecule, a new
# molecule built from the abbreviation is stored in @mol_map under the text
# id, that id is added to the matching reaction group, and the text is
# removed from @text_map.
#
# Changes from the previous version:
# * a leftover debugging +puts+ was removed;
# * the reaction group is taken from the reactants/products entry that was
#   actually matched, not from the first recorded position, which is not
#   guaranteed to be a reactant or product position.
#
# @return [Array] keys removed from @text_map
def refine_text_as_molecule
  key_to_delete = []

  @text_map.each do |k, text|
    mol = @mol_map.values.detect { |m| m.text_ids.include?(k) }
    next if mol.nil?

    smi = ChemScanner.get_abbreviation(text.value)
    next if smi.empty?

    # arrow id => position of the text relative to that reaction
    group_pos = {}
    @reactions.each do |reaction|
      rid = reaction.arrow_id
      group = detect_position(@arrow_map[rid], text.polygon)
      group_pos[rid] = group unless group.nil?
    end

    next if group_pos.value?("reagents")

    target = group_pos.detect { |_, grp| %w[reactants products].include?(grp) }
    next if target.nil?

    arrow_id, group = target
    key_to_delete.push(k)
    mol.text_ids.delete(k)
    @mol_map[k] = Molecule.new_from_smiles(k, smi)

    reaction = @reactions.detect { |r| r.arrow_id == arrow_id }
    # "reactants" -> reactant_ids, "products" -> product_ids
    reaction.send("#{group[0...-1]}_ids").push(k)
  end

  # Don't need to keep it in text_map anymore
  key_to_delete.each { |k| @text_map.delete(k) }
end

#refine_text_label ⇒ Object

A text_id can be present in both text_map and mol_group_map. Handles text used as a label, e.g. “ligand = ”, “amide = ”.



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/chem_scanner/interpreter/post_process/text_label.rb', line 12

# Handles molecules whose text is a label definition, i.e. whose stripped
# text ends with "=" (for example "ligand =").
#
# For each such molecule the label text (without the trailing "=") is
# looked up in the arrow texts of every reaction, and the molecule id is
# added to the +reagent_ids+ of every reaction. When at least one arrow
# text contains the label, the molecule id is also removed from the
# +reactant_ids+ and +product_ids+ of all reactions.
#
# NOTE(review): the id is added to +reagent_ids+ of every reaction, whether
# or not that reaction's arrow texts mention the label — confirm this is
# intended.
#
# @return [Hash] the label-defining entries selected from @mol_map
def refine_text_label
  label_molecules = @mol_map.select { |_, m| m.text.strip.end_with?("=") }

  label_molecules.each do |mid, mol|
    label_text = mol.text.strip.chomp("=").strip
    existed = false

    @reactions.each do |r|
      mentions = @arrow_map[r.arrow_id].text_arr.count do |tid|
        @text_map[tid].value.include?(label_text)
      end
      existed = true if mentions.positive?

      r.reagent_ids.push(mid) unless r.reagent_ids.include?(mid)
    end

    next unless existed

    @reactions.each do |r|
      %w[reactant product].each do |group|
        ids = r.send("#{group}_ids")
        ids.delete(mid) if ids.include?(mid)
      end
    end
  end
end

#replace_label_by_molecule ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/chem_scanner/interpreter/post_process/label_by_molecule.rb', line 8

# Adds molecules referenced by arrow texts to the reagents of each reaction.
#
# For every text on a reaction's arrow:
# * the bold part is split with ABB_DELIM and each non-empty piece is
#   compared with the molecules' labels;
# * the non-bold part is split the same way; pieces shorter than three
#   characters or matching /eq(uiv)?\.?/ are ignored, the others are
#   compared with the molecules' stripped text.
# The id of the first matching molecule is appended to +reagent_ids+ unless
# it is already there.
#
# @return [Array] @reactions
def replace_label_by_molecule
  @reactions.each do |r|
    @arrow_map[r.arrow_id].text_arr.each do |tid|
      text = @text_map[tid]

      text.bold_text.strip.split(ABB_DELIM).each do |bold|
        next if bold.empty?

        labelled = @mol_map.find { |_, m| m.label == bold }
        next if labelled.nil?

        mid = labelled.first
        r.reagent_ids << mid unless r.reagent_ids.include?(mid)
      end

      text.non_bold_text.strip.split(ABB_DELIM).each do |plain|
        next if plain.empty?
        next if plain.length < 3 || plain =~ /eq(uiv)?\.?/

        named = @mol_map.find { |_, m| m.text.strip == plain.strip }
        next if named.nil?

        mid = named.first
        r.reagent_ids << mid unless r.reagent_ids.include?(mid)
      end
    end
  end
end

#split_text(text) ⇒ Object



54
55
56
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 54

# Splits +text+ on ABB_DELIM and drops pieces of at most one character.
#
# @param text [String] text to split
# @return [Array<String>] the pieces longer than one character
def split_text(text)
  pieces = text.split(ABB_DELIM)
  pieces.reject { |piece| piece.length <= 1 }
end

#text_regex(text, regex) ⇒ Object



217
218
219
220
221
222
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 217

# Returns the first match of +regex+ in +text+ with surrounding whitespace
# removed.
#
# @param text [String] text to search
# @param regex [Regexp] pattern to look for
# @return [String] the stripped whole match, or "" when there is no match
def text_regex(text, regex)
  found = text.match(regex)
  found ? found[0].strip : ""
end

#time_duration_range_regex ⇒ Object



150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/chem_scanner/interpreter/post_process/reaction_info.rb', line 150

# Builds the Regexp that recognises a time duration, or two durations
# joined by RANGE_REGEX or one of JOIN_WORDS.
#
# The first capture group holds the whole duration expression (provided
# START_REGEX adds no capture group of its own — TODO confirm).
#
# Previously the optional marker after the first duration applied only to
# its unit, so the first number was mandatory and every match needed two
# numbers. Durations such as "2 h" or "1.5 h" therefore never matched, and
# "12 h" matched only by being read as "1" followed by "2 h". The leading
# "number, optional unit, optional linker" part is now optional as a whole.
# It is wrapped in a non-capturing group, so capture numbering is unchanged.
#
# @return [Regexp] extended-mode pattern for time durations
def time_duration_range_regex
  day = "days?|dy|d"
  hour = "hours?|hrs?|h"
  minute = "minutes?|mins?|m"
  second = "seconds?|secs?|s"
  real_number = '(\d+|\d+\.\d+)'

  time_unit = "(#{day}|#{hour}|#{minute}|#{second})"
  time_regex = "#{real_number}\\s*#{time_unit}"
  join_words = JOIN_WORDS.join("|")
  linker_regex = "(#{RANGE_REGEX}|(#{join_words}))"

  %r{
    #{START_REGEX}
    ((?:#{time_regex}?\s*(#{linker_regex}\s*)?)?(#{real_number}\s*#{time_unit}))
    #{ENDING_REGEX}
  }x
end