Module: ChemScanner::Interpreter::BoldGroup

Included in:: MoleculeTextGroup, ReactionTextGroup

Defined in:: lib/chem_scanner/interpreter/text_group/bold_groups.rb

Instance Method Summary collapse

Instance Method Details

#extract_alphabet_number(text) ⇒ `Object`

# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 128

def extract_alphabet_number(text)
  # 3a,b -> 3a, 3b
  regex = /(\d+)( *[a-z],*)+/
  res = regex =~ text
  return [] if res.nil?

  els = text.split(",")
  anchor = els.first.strip.scan(/\d+/).first
  els[0] = els[0].gsub(anchor, "")

  els.reduce([]) do |arr, char|
    arr.push(anchor + char.strip)
  end
end

#extract_range_number(text) ⇒ `Object`

# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 116

def extract_range_number(text)
  # 3-6 -> 3,4,5,6
  regex = /(\d+)-(\d+)/
  res = text.scan(regex)
  return [] if res.empty?

  bnum, enum = res.first
  return [] if bnum >= enum

  (bnum..enum).to_a
end

#group_combinations(rgroup) ⇒ `Object`

# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 143

def group_combinations(rgroup)
  return [] if rgroup.empty?

  combis = []

  key_arr = rgroup.keys
  group_num = key_arr.size

  # keep track of next element in each of the R-groups substitutions
  indices = Array.new(group_num, 0)

  loop do
    combi = {}
    indices.each_with_index.each do |val, idx|
      group = key_arr[idx]
      substitute = rgroup[group][val]

      combi[group] = substitute
    end
    combis.push(combi)

    group_max_idx = group_num - 1

    # rightmost array, has more elements left after the current element
    next_group_idx = group_max_idx
    next_group_size = rgroup[key_arr[next_group_idx]].size
    while next_group_idx >= 0 &&
        (indices[next_group_idx] + 1 >= next_group_size)
      next_group_idx -= 1
      next_group_size = rgroup[key_arr[next_group_idx]].size
    end

    return combis if next_group_idx < 0

    indices[next_group_idx] += 1
    (next_group_idx + 1..group_max_idx).each { |x| indices[x] = 0 }
  end
end

#line_bold_groups(line, target_groups) ⇒ `Object`

# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 43

def line_bold_groups(line, target_groups)
  bold_regex = /\*\*([^\*\*]*)\*\*/
  bold = line.scan(bold_regex).map(&:first).map(&:strip)
  bold.reject! do |x|
    bold.select { |y| (y.size > x.size) && y.include?(x) }.count > 0
  end

  group_or = "(" + target_groups.join("|") + ")"
  group_regex = /#{group_or} *=/
  res = line.enum_for(:scan, group_regex)
  positions = res.map { Regexp.last_match.begin(0) }

  text_arr = positions.map.with_index do |pos, idx|
    end_pos = idx == (positions.size - 1) ? line.size : positions[idx + 1]
    rtext = line[pos, end_pos - pos]
    regex = /#{group_or} *= *([^\*\*])*(?=$|\n|\.|\z|\Z|\*\*)/
    rtext[regex].strip
  end

  groups = text_arr.reduce({}) do |acc, gtext|
    group_val = gtext.scan(/#{group_or}? ?(?==)/)
    temp = gtext.split("=", 2).last.split(",").map do |t|
      t.strip.gsub(/^-/, "")
    end
    substitutes = temp.compact.uniq.select do |text|
      is_superatom = !ChemScanner.get_superatom(text).empty?
      is_abb = !ChemScanner.get_abbreviation(text).empty?
      is_n_atom = /^\d+$/.match?(text)

      is_superatom || is_abb || is_n_atom
    end
    next acc if group_val.empty? || substitutes.empty?

    info = { group_val.first.first.strip => substitutes }
    acc.merge(info) { |_, cur, new| cur.concat(new).compact.uniq }
  end

  [bold, groups]
end

#normalize_bold(bold) ⇒ `Object`

# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 102

def normalize_bold(bold)
  arr = []

  # 1-3 or 5-10 ... => [1,2,3] or [5,6,7,8,9,10]
  range = extract_range_number(bold)
  return arr.concat(range) unless range.empty?

  # 1a,b or 8a,b,c ... => [1a, 1b] or [8a,8b,8c]
  range = extract_alphabet_number(bold)
  return arr.concat(range) unless range.empty?

  [bold]
end

#normalize_bold_groups(bolds, groups) ⇒ `Object`

# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 83

def normalize_bold_groups(bolds, groups)
  normalized = []

  bolds.each_with_index do |bold, idx|
    bgroup = {}
    groups.each { |k, v| bgroup[k] = v[idx] unless v[idx].nil? }
    norm_bolds = bold.split(",").reduce([]) do |arr, b|
      nbold = normalize_bold(b.strip)
      arr.concat(nbold)
    end

    norm_bolds.each do |b|
      normalized.push(bold: b, group: bgroup)
    end
  end

  normalized
end

#text_bold_groups(tid) ⇒ `Object`

# File 'lib/chem_scanner/interpreter/text_group/bold_groups.rb', line 8

def text_bold_groups(tid)
  all_groups = @alias_info.values.reduce([]) do |arr, infos|
    groups = infos.map { |info| info[:group] }
    arr.concat(groups).uniq
  end

  # In case "** \n", splitting results
  # "[...**], [bold text** ...]"
  # The bold part will be missed for next line
  text = @text_map[tid].markdown.gsub(/ *\*\* *\n/, "\n**")
  lines = text.split("\n")

  bold_arr = []
  group_info = {}

  lines.each do |line|
    bold_info, groups = line_bold_groups(line, all_groups)
    next if bold_info.empty? && groups.empty?

    list_bold = bold_info.reduce([]) do |arr, bold|
      # Remove ":" from "1:", or "2a,b:"
      bold.gsub!(":", "")
      norm = normalize_bold(bold)
      barr = norm.empty? ? bold : norm
      arr.push(barr)
    end

    bold_list = list_bold.flatten.reject { |b| /^[a-z]+$/.match?(b) }
    bold_arr.concat(bold_list)
    group_info.merge!(groups) { |_, old, new| old.concat(new) }
  end

  [bold_arr, group_info]
end

Module: ChemScanner::Interpreter::BoldGroup

Instance Method Summary collapse

Instance Method Details

#extract_alphabet_number(text) ⇒ Object

#extract_range_number(text) ⇒ Object

#group_combinations(rgroup) ⇒ Object

#line_bold_groups(line, target_groups) ⇒ Object

#normalize_bold(bold) ⇒ Object

#normalize_bold_groups(bolds, groups) ⇒ Object

#text_bold_groups(tid) ⇒ Object

#extract_alphabet_number(text) ⇒ `Object`

#extract_range_number(text) ⇒ `Object`

#group_combinations(rgroup) ⇒ `Object`

#line_bold_groups(line, target_groups) ⇒ `Object`

#normalize_bold(bold) ⇒ `Object`

#normalize_bold_groups(bolds, groups) ⇒ `Object`

#text_bold_groups(tid) ⇒ `Object`