Module: ChemScanner::Interpreter::ReactionDetection

Includes:
SchemeBase
Included in:
Scheme
Defined in:
lib/chem_scanner/interpreter/post_process/reaction_step.rb,
lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb,
lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb,
lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb,
lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb,
lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb,
lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb

Instance Method Summary collapse

Methods included from SchemeBase

#add_molecule_substitution_info, #add_reaction_substitution_info, #assemble_molecule_text, #auto_fit_arrow_polygons

Instance Method Details

#assign_molecule_groupObject



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/chem_scanner/interpreter/reaction_detection/molecule_group.rb', line 10

# Turns single-molecule groups into reactants/products of a reaction.
#
# A group from @mol_group_map is considered only when its key is not listed
# in any reaction arrow's text_arr, it holds exactly one molecule and that
# molecule is not boxed. The group title polygon is located relative to every
# reaction arrow via detect_position:
#   * any "reagents" hit      -> the group is left alone;
#   * first "reactants"/"products" hit -> the molecule's fragment id is pushed
#     to the matching id list of that reaction, and the title text is moved
#     from @text_map onto the molecule.
#
# @return [Hash] the candidate groups that were examined
def assign_molecule_group
  arrow_text_ids = @reactions.flat_map do |reaction|
    @arrow_map[reaction.arrow_id].text_arr
  end

  auto_fit_arrow_polygons

  candidates = @mol_group_map.select do |text_id, mol_group|
    next false if arrow_text_ids.include?(text_id)

    members = mol_group.molecules
    members.count == 1 && !members.first.boxed
  end

  candidates.each do |group_key, mol_group|
    molecule = mol_group.molecules.first
    fragment_id = molecule.fragment.id

    # arrow id => position name, only for arrows where a position was found
    positions = @reactions.each_with_object({}) do |reaction, found|
      arrow_id = reaction.arrow_id
      side = detect_position(@arrow_map[arrow_id], mol_group.title.polygon)
      found[arrow_id] = side unless side.nil?
    end

    next if positions.value?("reagents")

    arrow_id, side = positions.detect do |_, name|
      %w[reactants products].include?(name)
    end
    next if side.nil?

    # The title is stored on the molecule now; drop it from @text_map and
    # from every molecule's text id list.
    molecule.text = @text_map.delete(group_key).value
    molecule.text_ids.delete(group_key)
    @mol_map.each_value { |other| other.text_ids.delete(group_key) }

    target = @reactions.detect { |reaction| reaction.arrow_id == arrow_id }
    # "reactants" -> reactant_ids, "products" -> product_ids
    target.send("#{side.chop}_ids").push(fragment_id)
  end
end

#assign_textObject

Attach/bind text to molecule or arrow



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# File 'lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb', line 9

# Attach/bind every text in @text_map to a molecule or to an arrow.
#
# For each text the nearest molecule (nearest_molecule) and nearest arrow
# (nearest_arrow) are looked up; both helpers report key 0 when nothing was
# found.
#   * No arrow: the text goes to the nearest molecule. If
#     try_detect_label_position recognised it as a label, it is queued and
#     resolved after the main loop by matching molecule labels.
#   * No molecule: the text id is attached to the arrow.
#   * Otherwise the text id is attached to the arrow when the arrow distance
#     is below 2.5x the molecule distance and text_around_arrow? agrees; else
#     it goes to the nearest molecule (molecule-group titles are skipped).
#
# Finally assemble_molecule_text is run on every molecule.
def assign_text
  tgroup_ids = @mol_group_map.keys
  text_as_mol_ids = []

  @text_map.each do |k, text|
    group = try_detect_label_position(text)
    center = text.polygon.center

    min_mol = nearest_molecule(center)
    min_arrow = nearest_arrow(text)
    arrow = @arrow_map[min_arrow.key]

    if arrow.nil?
      mol_key = min_mol.key
      # Neither an arrow nor a molecule is available: nothing to bind to.
      next unless @mol_map.key?(mol_key)

      if group.nil?
        @mol_map[mol_key].text_ids.push(k)
      else
        text_as_mol_ids.push(id: k, mol: mol_key, group: group)
      end

      next
    end

    if min_mol.key.zero?
      # text_arr holds text ids (see the to_arrow branch below), so the
      # text key must be stored here, not the arrow's own key.
      arrow.text_arr.push(k)
      next
    end

    to_arrow = (
      min_arrow.value < min_mol.value * 2.5 &&
      text_around_arrow?(arrow, text, min_arrow.value)
    )

    if to_arrow
      arrow.text_arr.push(k)
      next
    end

    # Do not add a molecule-group text to molecule as description
    @mol_map[min_mol.key].text_ids.push(k) unless tgroup_ids.include?(k)
  end

  # Resolve queued label texts: a label matching a molecule's label makes
  # that molecule a reactant/product; otherwise it stays a plain description
  # of the nearest molecule.
  text_as_mol_ids.each do |tinfo|
    tid = tinfo[:id]
    text = @text_map[tid]
    mid = tinfo[:mol]
    mol = @mol_map.values.detect { |m| m.label == text.bold_text }

    if mol.nil?
      @mol_map[mid].text_ids.push(tid)
    else
      rid = tinfo[:group].keys.first
      group = tinfo[:group][rid]
      reaction = @reactions.detect { |r| r.arrow_id == rid }
      # "reactants" -> reactant_ids, "products" -> product_ids
      rgroup = reaction.send("#{group[0..-2]}_ids")
      rgroup.push(mol.id).uniq!
    end
  end

  @mol_map.each_value { |mol| assemble_molecule_text(mol) }
end

#assign_to_reactionObject



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb', line 12

# Creates one Reaction per arrow in @arrow_map and distributes the unboxed
# molecules of @mol_map over its reagent/reactant/product id lists.
#
# Three passes:
#   1. For every arrow, every unboxed molecule is positioned with
#      detect_position after the polygons of all arrows have been rebuilt for
#      that molecule. Molecules without a position are remembered per arrow.
#   2. A molecule that is a reagent of one reaction and a reactant/product of
#      another is removed from one of the two, depending on its distance to
#      the arrow of the reaction where it is a reagent.
#   3. After auto_fit_arrow_polygons, the remembered molecules are positioned
#      once more; still-unpositioned ones are dropped.
#
# @return [Hash] arrow id => ids that had no position in pass 1
def assign_to_reaction
  pending = {}

  @arrow_map.each do |arrow_id, arrow|
    reaction = Reaction.new
    reaction.arrow_id = arrow_id
    unplaced = []

    @mol_map.each do |mol_id, mol|
      next if mol.boxed

      outline = mol.polygon

      # Rebuild the polygons of every arrow sized for this molecule.
      @arrow_map.each_value do |any_arrow|
        gap = any_arrow.min_distance_to_polygon(outline)
        any_arrow.build_polygons(outline.height + gap)
      end

      bucket = case detect_position(arrow, outline)
               when "reagents" then reaction.reagent_ids
               when "reactants" then reaction.reactant_ids
               when "products" then reaction.product_ids
               else unplaced
               end
      bucket.push(mol_id)
    end

    @reactions.push(reaction)
    pending[arrow_id] = unplaced unless unplaced.empty?
  end

  # Molecules which are both reagents and reactants/products:
  # distance to the reagent-side arrow above 2 -> drop it as reagent,
  # otherwise drop it from the other reaction.
  @reactions.each do |current|
    reagent_ids = current.reagent_ids
    arrow = @arrow_map[current.arrow_id]

    @reactions.each do |other|
      next if other.arrow_id == current.arrow_id

      shared = (reagent_ids & other.reactant_ids) +
               (reagent_ids & other.product_ids)
      shared.each do |mol_id|
        gap = arrow.min_distance_to_polygon(@mol_map[mol_id].polygon)
        loser = gap > 2 ? current : other
        loser.delete_id(mol_id)
      end
    end
  end

  auto_fit_arrow_polygons

  pending.each do |arrow_id, mol_ids|
    reaction = @reactions.detect { |r| r.arrow_id == arrow_id }
    arrow = @arrow_map[arrow_id]

    mol_ids.each do |mol_id|
      case detect_position(arrow, @mol_map[mol_id].polygon)
      when "reagents" then reaction.reagent_ids.push(mol_id)
      when "reactants" then reaction.reactant_ids.push(mol_id)
      when "products" then reaction.product_ids.push(mol_id)
      end
    end
  end
end

#check_position(mol_poly, arrow, prod_side = true) ⇒ Object

Check whether the molecule belongs to the reaction



97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb', line 97

# Checks whether a molecule polygon is hit by the arrow on the chosen side
# without another arrow standing in between.
#
# The line through the arrow's head segment (prod_side true) or tail segment
# (prod_side false) must intersect mol_poly. Every other arrow in @arrow_map
# is then examined: arrows whose head segment contains, or is contained in,
# the chosen segment are ignored; any remaining arrow whose own side line
# also intersects mol_poly and which intersects the stretch between the
# segment's point2 and the first intersection point rejects the molecule.
#
# @param mol_poly polygon of the molecule
# @param arrow arrow of the reaction being tested
# @param prod_side [Boolean] true: use head segments, false: tail segments
# @return [Boolean]
def check_position(mol_poly, arrow, prod_side = true)
  side = prod_side ? :head_segment : :tail_segment

  segment = arrow.public_send(side)
  axis = segment.to_line
  return false unless axis.intersects_with_polygon?(mol_poly)

  first_hit = axis.intersection_points_with_polygon(mol_poly).first
  stretch = Geometry::Segment.new(segment.point2, first_hit)

  @arrow_map.except(arrow.id).each_value.none? do |other|
    other_head = other.head_segment
    overlapping = other_head.contains_segment?(segment) ||
                  segment.contains_segment?(other_head)

    if overlapping
      false
    else
      other_axis = other.public_send(side).to_line
      other_axis.intersects_with_polygon?(mol_poly) &&
        other.all_intersects_with_segment?(stretch)
    end
  end
end

#check_reaction_orderringObject



48
49
50
51
52
53
54
55
56
57
# File 'lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb', line 48

# Tells multi_line_chain_reaction whether it should bail out.
#
# @return [Boolean] true when there are fewer than two arrows, or when any
#   arrow has middle points or a head segment whose line is not horizontal;
#   false otherwise
def check_reaction_orderring
  return true if @arrow_map.count < 2

  @arrow_map.each_value.any? do |arrow|
    arrow.middle_points.count > 0 ||
      !arrow.head_segment.to_line.horizontal?
  end
end

#detect_position(arrow, mol_poly) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
94
# File 'lib/chem_scanner/interpreter/reaction_detection/assign_to_reaction.rb', line 82

# Classifies a polygon relative to an arrow.
#
# Checks are made in this order and the first hit wins:
#   "products"  - check_position on the head side and arrow.product_side?
#   "reactants" - check_position on the tail side and arrow.reactant_side?
#   "reagents"  - arrow.polygon_around?
#
# @param arrow the reaction arrow
# @param mol_poly polygon to classify
# @return [String, nil] the position name, or nil when none applies
def detect_position(arrow, mol_poly)
  centre = mol_poly.center

  if check_position(mol_poly, arrow) && arrow.product_side?(centre)
    "products"
  elsif check_position(mol_poly, arrow, false) && arrow.reactant_side?(centre)
    "reactants"
  elsif arrow.polygon_around?(mol_poly)
    "reagents"
  end
end

#detect_reaction_step(reaction) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/chem_scanner/interpreter/post_process/reaction_step.rb', line 12

# Splits a numbered reaction description ("1) ...", "(ii) ...", "A. ...")
# into ReactionStep objects appended to reaction.steps.
#
# The description is scanned line by line with two patterns ("x)" / "x." and
# "(x)"). A pattern is accepted when all captured labels belong to a single
# numbering scheme (arabic, upper/lower roman, upper-case letters). Nothing
# happens unless an accepted pattern produced at least two labels.
#
# For every label the text up to the next label becomes the step
# description; temperature and time come from extract_reaction_info, and
# every entry of reaction.reagent_abbs contained in the text is added to the
# step via ChemScanner.get_abbreviation.
#
# @param reaction object exposing description, reagent_abbs, steps,
#   reagents and writable time/temperature
def detect_reaction_step(reaction)
  numbering_schemes = [
    ["1", "2", "3", "4", "5", "6", "7", "8", "9"],
    ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX"],
    ["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix"],
    ["A", "B", "C", "D", "E", "F", "G", "H", "J"],
  ]

  step_patterns = [
    /(^|\A)(([1-9a-z]{0,3}) *[)\.] *(.*))($|\z)/i,
    /(^|\A)\((([1-9a-z]{0,3}) *\) *(.*))($|\z)/i,
  ]

  recognised = false
  matches = []
  labels = []
  step_patterns.each do |pattern|
    next if recognised

    # Collect the MatchData of every occurrence, not just the captures.
    matches = reaction.description.enum_for(:scan, pattern).map {
      Regexp.last_match
    }
    labels = matches.map { |m| m[3] }
    next if labels.empty?

    recognised = numbering_schemes.any? do |scheme|
      (scheme & labels) == labels
    end
  end

  return unless recognised && labels.count >= 2

  all_labels = numbering_schemes.flatten
  has_temperature = false
  has_time = false
  starts = matches.map { |m| m.begin(0) }

  matches.each_with_index do |m, idx|
    # The step text runs up to the character before the next label, or to
    # the end of the description for the last label.
    following = starts[idx + 1]
    stop = following.nil? ? -1 : following - 1
    chunk = reaction.description[starts[idx]..stop]

    offset = if m[4].empty?
               chunk.index(m[2]) + m[2].size
             else
               chunk.index(m[4]) || 0
             end
    chunk = chunk[offset..-1]
    temperature, _, time = extract_reaction_info([chunk])

    step = ReactionStep.new
    step.temperature = temperature
    step.time = time
    step.description = chunk
    # Each scheme has 9 entries, so the index modulo 9 is the step ordinal.
    step.number = (all_labels.index(m[3]) % 9) + 1

    # NOTE(review): both flags are overwritten on every iteration, so only
    # the last step decides them - confirm this is intended.
    has_time = !time.empty?
    has_temperature = !temperature.empty?

    reaction.reagent_abbs.each do |abb|
      next unless chunk.include?(abb)

      step.reagents.push(ChemScanner.get_abbreviation(abb))
    end

    reaction.steps.push(step)
  end

  reaction.time = "" if has_time
  reaction.temperature = "" if has_temperature

  # NOTE: tempo tricky assign reagents to empty step
  return if reaction.reagents.count != 1

  blank_steps = reaction.steps.select do |s|
    s.description.empty? || s.description == "\n"
  end
  return if blank_steps.count != 1

  blank_steps.first.reagents.push(reaction.reagents.first.cano_smiles)
end

#distance_molecule_group(rgroup, arrow, group) ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb', line 69

# Computes, for every molecule id of rgroup present in @mol_map, the smaller
# of two distances measured along the arrow's tail line (group
# "reactant_ids") or head line (any other group):
#   * from the arrow tail/head point to the molecule's intersection points
#     with that line;
#   * from those intersection points to the intersection points of the other
#     molecules in rgroup.
# 9_999_999 stands in for "no intersection".
#
# @param rgroup [Array] molecule ids of one side of a reaction
# @param arrow arrow of the reaction
# @param group [String] "reactant_ids" selects the tail side
# @return [Hash] molecule id => distance
def distance_molecule_group(rgroup, arrow, group)
  unreachable = 9_999_999

  if group == "reactant_ids"
    anchor = arrow.tail
    axis = arrow.tail_segment.to_line
  else
    anchor = arrow.head
    axis = arrow.head_segment.to_line
  end

  crossings = ->(mol_id) do
    @mol_map[mol_id].polygon.intersection_points_with_line(axis)
  end

  rgroup.each_with_object({}) do |id, distances|
    next unless @mol_map.key?(id)

    own_points = crossings.call(id)

    # Distance to arrow
    to_arrow = own_points.reduce(unreachable) do |best, point|
      span = Geometry.distance(anchor, point)
      span < best ? span : best
    end

    # Distance to other molecule within group
    to_sibling = unreachable
    (rgroup - [id]).each do |sibling_id|
      next if @mol_map[sibling_id].nil?

      crossings.call(sibling_id).each do |sibling_point|
        own_points.each do |own_point|
          span = Geometry.distance(own_point, sibling_point)
          to_sibling = span if span < to_sibling
        end
      end
    end

    distances[id] = [to_arrow, to_sibling].min
  end
end

#molecules_intersects_with_segment(segment) ⇒ Object



156
157
158
159
160
161
162
163
# File 'lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb', line 156

# Lists the keys of all molecules in @mol_map whose polygon is intersected
# by the given segment.
#
# @param segment object responding to intersects_with_polygon?
# @return [Array] molecule keys, in @mol_map order
def molecules_intersects_with_segment(segment)
  crossed = @mol_map.select do |_, mol|
    segment.intersects_with_polygon?(mol.polygon)
  end

  crossed.keys
end

#multi_line_chain_reactionObject



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb', line 8

# Completes reactions of a chain that wraps over several rows.
#
# Skipped entirely when check_reaction_orderring returns true. Otherwise
# every reaction lacking reactants or products is completed from its
# neighbouring row, using the row layout returned by sort_arrow_map:
#   * no reactants: take the products of the last arrow of the previous row;
#   * no products (but reactants): take the reactants of the first arrow of
#     the next row.
# Reactions on the first row have no previous row and reactions on the last
# row have no next row; those are left untouched.
def multi_line_chain_reaction
  return if check_reaction_orderring

  rarray = @reactions.select do |r|
    r.reactant_ids.empty? || r.product_ids.empty?
  end
  return if rarray.empty?

  auto_fit_arrow_polygons

  sorted_akey = sort_arrow_map

  get_reaction = ->(id) { @reactions.detect { |r| r.arrow_id == id } }

  rarray.each do |reaction|
    rkey = sorted_akey.find_index do |key_arr|
      key_arr.include?(reaction.arrow_id)
    end
    next if rkey.nil?

    if reaction.reactant_ids.empty?
      # Index -1 would silently address the LAST row, so the first row must
      # be skipped explicitly.
      next if rkey.zero?

      other = get_reaction.call(sorted_akey[rkey - 1].last)
      next if other.nil?

      reaction.reactant_ids.concat(other.product_ids)
    else
      other_ids = sorted_akey[rkey + 1]
      next if other_ids.nil?

      other = get_reaction.call(other_ids.first)
      next if other.nil?

      reaction.product_ids.concat(other.reactant_ids)
    end
  end
end

#nearest_arrow(text) ⇒ Object



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb', line 108

# Finds the arrow whose segment is closest to a text.
#
# Only segments onto which the text polygon's center projects (the projected
# point lies on the segment) take part; the distance is the segment's
# distance to the text's bounding box.
#
# @param text object exposing polygon
# @return [OpenStruct] key: arrow key, value: distance; key 0 and value
#   9_999_999 when no segment qualified
def nearest_arrow(text)
  outline = text.polygon
  start = OpenStruct.new(key: 0, value: 9_999_999)

  @arrow_map.each_with_object(start) do |(arrow_key, arrow), best|
    arrow.segments.each do |segment|
      foot = segment.to_line.point_projection(outline.center)
      next unless segment.contains_point?(foot)

      gap = segment.distance_to_boundingbox(outline)
      next unless gap < best.value

      best.key = arrow_key
      best.value = gap
    end
  end
end

#nearest_molecule(point) ⇒ Object



93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb', line 93

# Finds the molecule of @mol_map closest to a point, using each molecule's
# min_distance_to_point.
#
# @param point point to measure from
# @return [OpenStruct] key: molecule key, value: distance; key 0 and value
#   9_999_999 when no molecule is closer than 9_999_999
def nearest_molecule(point)
  start = OpenStruct.new(key: 0, value: 9_999_999)

  @mol_map.each_with_object(start) do |(mol_key, mol), best|
    gap = mol.min_distance_to_point(point)
    next unless gap < best.value

    best.key = mol_key
    best.value = gap
  end
end

#process_reactions_stepObject



8
9
10
# File 'lib/chem_scanner/interpreter/post_process/reaction_step.rb', line 8

# Runs detect_reaction_step on every reaction in @reactions.
#
# @return [Array] @reactions
def process_reactions_step
  @reactions.each do |reaction|
    detect_reaction_step(reaction)
  end
end

#refine_duplicate_reagentsObject



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/chem_scanner/interpreter/reaction_detection/duplicate_reagents.rb', line 9

# Cleans up reagent ids shared between reactions.
#
# For every pair of distinct reactions (r, other):
#   * ids that are reactants/products of other are removed from r's reagents;
#   * for ids that are reagents of both, the object (molecule or text) is
#     located and the point returned by each arrow's contains_point? for the
#     object's polygon center is compared: when r's point is farther from
#     the center than other's, the id is scheduled for removal from r.
# Scheduled removals are applied at the end via Reaction#delete_id.
#
# NOTE(review): contains_point? is used as if it returns a point or nil -
# confirm against the arrow class.
#
# @return [Array<OpenStruct>] the scheduled removals (rid, id)
def refine_duplicate_reagents
  removals = []

  @reactions.each do |current|
    arrow = @arrow_map[current.arrow_id]

    @reactions.each do |other|
      next if other.arrow_id == current.arrow_id

      current.reagent_ids =
        current.reagent_ids - (other.reactant_ids + other.product_ids)

      (current.reagent_ids & other.reagent_ids).each do |id|
        subject = @mol_map.key?(id) ? @mol_map[id] : @text_map[id]
        centre = subject.polygon.center

        own_point = arrow.contains_point?(centre)
        other_point = @arrow_map[other.arrow_id].contains_point?(centre)
        next if own_point.nil? || other_point.nil?

        own_gap = centre.distance_to(own_point)
        other_gap = centre.distance_to(other_point)
        next unless own_gap > other_gap

        removals.push(OpenStruct.new(rid: current.arrow_id, id: id))
      end
    end
  end

  removals.each do |removal|
    owner = @reactions.detect { |r| r.arrow_id == removal.rid }
    owner.delete_id(removal.id)
  end
end

#remove_separated_molObject

(1): A —> C

(2): B —> D

|
|
V
E

Remove C from (2)



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/chem_scanner/interpreter/reaction_detection/remove_separated_mol.rb', line 16

# Removes molecules that sit far away from the rest of their reactant or
# product group and really belong to another, non-parallel reaction.
#
# For each reaction side with at least two ids, distance_molecule_group
# gives a distance per molecule. A molecule is removed when its distance
# exceeds twice the smallest distance of the group AND another reaction
# with a non-parallel arrow also lists it. Group members whose polygon
# center lies closer than that same threshold to a removed molecule are
# removed along with it.
#
# @return [Array] @reactions
def remove_separated_mol
  gap_factor = 2.0

  @reactions.each do |reaction|
    arrow = @arrow_map[reaction.arrow_id]

    %w[reactant_ids product_ids].each do |side|
      members = reaction.send(side)
      next if members.count < 2

      # Distance map of 1 molecule to arrow
      #   and other molecules within group
      distances = distance_molecule_group(members, arrow, side)
      threshold = gap_factor * distances.min_by { |_, value| value }.last

      outliers = distances.select do |mol_id, gap|
        next false unless gap > threshold

        @reactions.any? do |other|
          other.arrow_id != reaction.arrow_id &&
            other.molecule_ids.include?(mol_id) &&
            !arrow.parallel_to?(@arrow_map[other.arrow_id])
        end
      end
      doomed = outliers.keys

      # Neighbours of an outlier leave the group together with it.
      outliers.each_key do |mol_id|
        mol = @mol_map[mol_id]
        next if mol.nil?

        (members - [mol_id]).each do |neighbour_id|
          neighbour = @mol_map[neighbour_id]
          next if neighbour.nil?

          apart = Geometry.distance(mol.polygon.center,
                                    neighbour.polygon.center)
          doomed.push(neighbour_id) if apart < threshold
        end
      end

      members.delete_if { |id| doomed.include?(id) }
    end
  end
end

#sort_arrow_mapObject



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/chem_scanner/interpreter/reaction_detection/multi_line_chain_reaction.rb', line 59

# Groups the arrow keys of @arrow_map into rows and orders them.
#
# Starting from the first remaining arrow, all remaining arrows whose head y
# lies within +/- that arrow's height of its head y form one row. Inside a
# row the keys are ordered by ascending head x; rows are ordered by
# descending head y of their first arrow.
#
# @return [Array<Array>] rows of arrow keys
def sort_arrow_map
  rows = []
  remaining = @arrow_map.keys

  until remaining.empty?
    seed = @arrow_map[remaining.first]
    reach = seed.height
    band = (seed.head.y - reach)..(seed.head.y + reach)

    row, remaining = remaining.partition do |key|
      band.cover?(@arrow_map[key].head.y)
    end
    rows.push(row)
  end

  rows.each { |row| row.sort_by! { |key| @arrow_map[key].head.x } }
  rows.sort_by! { |row| -@arrow_map[row.first].head.y }

  rows
end

#text_around_arrow?(arrow, text, dist) ⇒ Boolean

Returns:

  • (Boolean)


130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb', line 130

# Decides whether a text sits next to an arrow (and is therefore a reaction
# annotation rather than a molecule description).
#
# The text polygon must be in the middle of the arrow (arrow.poly_in_middle?).
# Then, for some arrow segment, the perpendicular segment through the text
# center must have an end point on the arrow segment and must not cross any
# molecule other than the reagents of the arrow's reaction.
#
# @param arrow arrow to test against
# @param text text object exposing polygon
# @param dist distance from the text to the arrow, added to the polygon size
# @return [Boolean]
def text_around_arrow?(arrow, text, dist)
  outline = text.polygon
  return false unless arrow.poly_in_middle?(outline)

  # NOTE(review): the polygons are built with the enlarged size and then
  # immediately rebuilt with arrow.height as read after the first call -
  # confirm that this sequence is intended.
  arrow.build_polygons([outline.width, outline.height].max + dist)
  arrow.build_polygons(arrow.height)

  centre = outline.center
  reaction = @reactions.detect { |r| r.arrow_id == arrow.id }

  arrow.segments.any? do |segment|
    perpendicular = segment.perpen_segment_via_point(centre)
    touches = segment.contains_point?(perpendicular.point1) ||
              segment.contains_point?(perpendicular.point2)
    blockers = molecules_intersects_with_segment(perpendicular) -
               reaction.reagent_ids

    blockers.empty? && touches
  end
end

#try_detect_label_position(text) ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/chem_scanner/interpreter/reaction_detection/text_assignment.rb', line 72

# Tries to interpret a text as a molecule label placed as reactant/product.
#
# The text qualifies only when its whole value equals its bold part and
# detect_position yields a position for exactly one reaction arrow, that
# position being "reactants" or "products".
#
# @param text text object exposing value, bold_text and polygon
# @return [Hash, nil] { arrow id => position name } or nil
def try_detect_label_position(text)
  return nil unless text.value == text.bold_text

  located = @reactions.each_with_object({}) do |reaction, found|
    arrow_id = reaction.arrow_id
    side = detect_position(@arrow_map[arrow_id], text.polygon)
    found[arrow_id] = side unless side.nil?
  end

  return nil unless located.size == 1

  %w[reactants products].include?(located.values.first) ? located : nil
end