Module: Informers::Utils

Defined in: lib/informers/utils/hub.rb,
lib/informers/processors.rb,
lib/informers/tokenizers.rb,
lib/informers/utils/core.rb,
lib/informers/utils/math.rb,
lib/informers/utils/audio.rb,
lib/informers/utils/image.rb,
lib/informers/utils/ffmpeg.rb,
lib/informers/utils/tensor.rb,
lib/informers/utils/generation.rb

Defined Under Namespace

Modules: Hub

Classes: BeamSearchSampler, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, GenerationConfig, GreedySampler, LogitsProcessor, LogitsProcessorList, MinLengthLogitsProcessor, NoRepeatNGramLogitsProcessor, RawImage, Sampler

Class Method Summary

Class Method Details

._build_translation_inputs(slf, raw_inputs, tokenizer_options, generate_kwargs) ⇒ `Object`

# File 'lib/informers/tokenizers.rb', line 216

# Validates a translation tokenizer and the requested language codes, stores the
# forced BOS token id for the target language in +generate_kwargs+, and then
# tokenizes +raw_inputs+.
#
# @param slf [Object] tokenizer-like object; must expose +language_codes+ (Array),
#   +language_regex+ (Regexp) and +lang_to_token+ (callable), and is also sent
#   +convert_tokens_to_ids+ and +call+
# @param raw_inputs [Object] forwarded unchanged as the first argument of +slf.call+
# @param tokenizer_options [Hash] splatted into +slf.call+ as keyword arguments
# @param generate_kwargs [Hash] read for +:src_lang+ and +:tgt_lang+; mutated in place
# @return [Object] whatever +slf.call+ returns
# @raise [Error] if a required tokenizer attribute is missing or of the wrong kind,
#   or if a language code is not listed in +slf.language_codes+
def self._build_translation_inputs(slf, raw_inputs, tokenizer_options, generate_kwargs)
  unless slf.respond_to?(:language_codes) && slf.language_codes.is_a?(Array)
    raise Error, "Tokenizer must have `language_codes` attribute set and it should be an array of language ids."
  end
  unless slf.respond_to?(:language_regex) && slf.language_regex.is_a?(Regexp)
    raise Error, "Tokenizer must have `language_regex` attribute set and it should be a regular expression."
  end
  unless slf.respond_to?(:lang_to_token) && slf.lang_to_token.respond_to?(:call)
    raise Error, "Tokenizer must have `lang_to_token` attribute set and it should be a function."
  end

  source_code = generate_kwargs[:src_lang]
  target_code = generate_kwargs[:tgt_lang]

  # The target language is mandatory and must be a known code.
  unless slf.language_codes.include?(target_code)
    raise Error, "Target language code #{target_code.inspect} is not valid. Must be one of: #{slf.language_codes.join(", ")}"
  end

  # The source language is optional, but when supplied it must be a known code.
  # NOTE(review): the source code is only validated here; nothing below uses it.
  if !source_code.nil? && !slf.language_codes.include?(source_code)
    raise Error, "Source language code #{source_code.inspect} is not valid. Must be one of: #{slf.language_codes.join(", ")}"
  end

  # Override the `forced_bos_token_id` to force the correct language.
  # NOTE(review): written under a String key while the language codes above are
  # read via Symbol keys -- confirm which key the consumer looks up.
  target_token = slf.lang_to_token.call(target_code)
  generate_kwargs["forced_bos_token_id"] = slf.convert_tokens_to_ids([target_token])[0]

  slf.call(raw_inputs, **tokenizer_options)
end

.calculate_reflect_offset(i, w) ⇒ `Object`



7
8
9

# File 'lib/informers/utils/core.rb', line 7

# Folds +i+ into the range 0..w by reflecting it with a period of +2 * w+.
#
# @param i [Integer] position to fold
# @param w [Integer] reflection width
# @return [Integer] absolute offset after reflection
def self.calculate_reflect_offset(i, w)
  period = 2 * w
  shifted = (i + w) % period
  (shifted - w).abs
end

.center_to_corners_format(v) ⇒ `Object`

# File 'lib/informers/processors.rb', line 658

# Converts a box given as (center x, center y, width, height) into its two
# opposite corners.
#
# @param v [Array<Numeric>] four values: center x, center y, width, height
# @return [Array<Float>] +[x0, y0, x1, y1]+
def self.center_to_corners_format(v)
  center_x, center_y, box_width, box_height = v
  half_width = box_width / 2.0
  half_height = box_height / 2.0

  [
    center_x - half_width,
    center_y - half_height,
    center_x + half_width,
    center_y + half_height
  ]
end

.dims(tensor) ⇒ `Object`

# File 'lib/informers/utils/tensor.rb', line 37

# Reports the size of each nesting level of a nested array by following the
# first element of every level.
#
# @param tensor [Object] nested array; a non-Array value yields +[]+
# @return [Array<Integer>] sizes from the outermost level inward
def self.dims(tensor)
  shape = []
  level = tensor

  loop do
    break unless level.is_a?(Array)

    shape << level.size
    # Only the first element is inspected; the nesting is not checked for uniformity.
    level = level[0]
  end

  shape
end

.dispatch_callback(progress_callback, data) ⇒ `Object`



3
4
5

# File 'lib/informers/utils/core.rb', line 3

# Invokes +progress_callback+ with +data+ when a callback was supplied.
#
# @param progress_callback [#call, nil] callable to notify; skipped when falsy
# @param data [Object] payload handed to the callback
# @return [Object, nil] the callback's return value, or +nil+ when there is no callback
def self.dispatch_callback(progress_callback, data)
  return unless progress_callback

  progress_callback.call(data)
end

.ffmpeg_read(data, sampling_rate) ⇒ `Object`

This method is derived from the Transformers Python library.

# File 'lib/informers/utils/ffmpeg.rb', line 18

# Decodes encoded audio bytes into mono 32-bit float samples by piping them
# through the +ffmpeg+ command-line tool.
#
# Derived from the Transformers Python library.
#
# @param data [String] encoded audio bytes, written to ffmpeg's stdin
# @param sampling_rate [Integer] value passed to ffmpeg's +-ar+ option
# @return [Array<Float>] samples unpacked from ffmpeg's little-endian float32 output
# @raise [Error] if the ffmpeg executable cannot be started, or if it exits with
#   a non-zero status
def self.ffmpeg_read(data, sampling_rate)
  ar = "#{sampling_rate}"
  ac = "1"
  format_for_conversion = "f32le"
  ffmpeg_command = [
    "ffmpeg",
    "-i",
    "pipe:0",
    "-ac",
    ac,
    "-ar",
    ar,
    "-f",
    format_for_conversion,
    "-hide_banner",
    "-loglevel",
    "quiet",
    "pipe:1"
  ]

  begin
    # Both pipes carry raw bytes, so keep them in binary mode.
    stdout, status = Open3.capture2(*ffmpeg_command, stdin_data: data, binmode: true)
  rescue Errno::ENOENT
    # A missing executable surfaces as ENOENT from spawn, not as an exit status.
    raise Error, "ffmpeg was not found but is required to load audio files from filename"
  end

  # ffmpeg started but reported failure (stderr is suppressed by `-loglevel quiet`).
  unless status.success?
    raise Error, "ffmpeg failed to decode the audio data (exit status: #{status.exitstatus.inspect})"
  end

  stdout.unpack("e*")
end

.get_top_items(items, top_k = 0) ⇒ `Object`

# File 'lib/informers/utils/math.rb', line 96

# Pairs every score with its index and orders the pairs from highest to lowest score.
#
# @param items [Array<Numeric>] scores
# @param top_k [Integer, nil] number of leading pairs to keep; +nil+ or a
#   non-positive value keeps all of them
# @return [Array<Array>] +[index, score]+ pairs in descending score order
def self.get_top_items(items, top_k = 0)
  indexed = items.each_with_index.map { |score, index| [index, score] }
  ranked = indexed.sort_by { |_index, score| -score }

  limit_requested = !top_k.nil? && top_k > 0
  limit_requested ? ranked[0, top_k] : ranked
end

.interpolate(input, shape, mode = "bilinear", align_corners = false) ⇒ `Object`

# File 'lib/informers/utils/tensor.rb', line 46

# Resizes a nested-array image to +shape+ by flattening it, delegating to
# +interpolate_data+, and reshaping the result.
#
# @param input [Array] nested array whose last two dimensions are read as
#   height and width, with an optional channel dimension before them
# @param shape [Array<Integer>] +[out_height, out_width]+
# @param mode [String] forwarded to +interpolate_data+
# @param align_corners [Boolean] forwarded to +interpolate_data+
# @return [Array] nested array shaped +[channels, out_height, out_width]+
def self.interpolate(input, shape, mode = "bilinear", align_corners = false)
  target_height, target_width = shape

  # Read the trailing dimensions; a missing channel dimension counts as one channel.
  source_dims = dims(input)
  channels = source_dims[-3] || 1
  source_height = source_dims[-2]
  source_width = source_dims[-1]

  resized = interpolate_data(
    input.flatten,
    [channels, source_height, source_width],
    [target_height, target_width],
    mode,
    align_corners
  )

  reshape(resized, [channels, target_height, target_width])
end

.interpolate_data(input, in_shape, out_shape, mode = "bilinear", align_corners = false) ⇒ `Object`

# File 'lib/informers/utils/math.rb', line 3

# Resamples a flat, channel-major image buffer to a new height and width using
# bilinear weights over the four neighbouring input pixels.
#
# @param input [Array<Numeric>] flat buffer indexed as
#   +channel * (in_height * in_width) + y * in_width + x+
# @param in_shape [Array<Integer>] +[in_channels, in_height, in_width]+
# @param out_shape [Array<Integer>] +[out_height, out_width]+
# @param mode [String] currently unused (see TODO below)
# @param align_corners [Boolean] currently unused (see TODO below)
# @return [Array<Float>] flat buffer of +in_channels * out_height * out_width+ values
def self.interpolate_data(input, in_shape, out_shape, mode = "bilinear", align_corners = false)
  channels, src_h, src_w = in_shape
  dst_h, dst_w = out_shape

  # TODO use mode and align_corners

  # Ratio of output size to input size along each axis
  scale_x = dst_w / src_w.to_f
  scale_y = dst_h / src_h.to_f

  # Number of elements in a single channel plane
  src_plane = src_h * src_w
  dst_plane = dst_h * dst_w

  resized = Array.new(dst_plane * channels)

  dst_h.times do |row|
    # Everything that depends only on the output row is computed once per row.
    src_y = (row + 0.5) / scale_y - 0.5
    top = src_y.floor
    bottom = [top + 1, src_h - 1].min
    top = [top, 0].max
    frac_y = src_y - top

    top_base = top * src_w
    bottom_base = bottom * src_w

    dst_w.times do |col|
      # Input x coordinate matching the centre of this output pixel
      src_x = (col + 0.5) / scale_x - 0.5
      left = src_x.floor
      right = [left + 1, src_w - 1].min
      left = [left, 0].max
      frac_x = src_x - left

      # Bilinear weights for the four neighbours. At the borders both
      # neighbours on an axis are the same pixel, so the weights still sum
      # correctly even when a fraction falls outside 0..1.
      weight_tl = (1 - frac_x) * (1 - frac_y)
      weight_tr = frac_x * (1 - frac_y)
      weight_bl = (1 - frac_x) * frac_y
      weight_br = frac_x * frac_y

      # Offsets of the four neighbours within one channel plane
      idx_tl = top_base + left
      idx_tr = top_base + right
      idx_bl = bottom_base + left
      idx_br = bottom_base + right

      pixel = row * dst_w + col

      channels.times do |channel|
        base = channel * src_plane

        resized[channel * dst_plane + pixel] =
          weight_tl * input[base + idx_tl] +
          weight_tr * input[base + idx_tr] +
          weight_bl * input[base + idx_bl] +
          weight_br * input[base + idx_br]
      end
    end
  end

  resized
end

.max(arr) ⇒ `Object`

# File 'lib/informers/utils/math.rb', line 110

# Finds the largest element of +arr+ together with its position.
#
# @param arr [Array<Numeric>] values to search
# @return [Array] +[value, index]+ of the maximum element
# @raise [Error] if +arr+ is empty
def self.max(arr)
  raise Error, "Array must not be empty" if arr.length == 0

  arr.each_with_index.max_by { |value, _index| value }
end

.mean_pooling(last_hidden_state, attention_mask) ⇒ `Object`

# File 'lib/informers/utils/tensor.rb', line 3

# Averages token vectors per sequence, weighting each token by its attention
# mask value.
#
# @param last_hidden_state [Array] nested array indexed as [sequence][token][feature]
# @param attention_mask [Array] nested array indexed as [sequence][token]
# @return [Array<Array<Float>>] one pooled feature vector per sequence
def self.mean_pooling(last_hidden_state, attention_mask)
  last_hidden_state.zip(attention_mask).map do |tokens, mask|
    # The feature count is taken from the first token of the sequence.
    Array.new(tokens[0].size) do |feature|
      weighted_total = 0.0
      mask_total = 0

      tokens.zip(mask).each do |vector, weight|
        mask_total += weight
        weighted_total += vector[feature] * weight
      end

      weighted_total / mask_total
    end
  end
end

.normalize(result) ⇒ `Object`

# File 'lib/informers/utils/tensor.rb', line 19

# Scales every row so that its Euclidean length is 1.
#
# @param result [Array<Array<Numeric>>] rows to scale
# @return [Array<Array<Float>>] rows divided by their own Euclidean norm
def self.normalize(result)
  result.map do |vector|
    squared_total = vector.sum { |component| component * component }
    length = Math.sqrt(squared_total)

    vector.map { |component| component / length }
  end
end

.ones_like(tensor) ⇒ `Object`

# File 'lib/informers/utils/tensor.rb', line 30

# Builds a nested array with the same structure as +tensor+ in which every
# leaf is the Integer 1.
#
# @param tensor [Array] flat or nested array
# @return [Array] array of the same shape filled with 1
def self.ones_like(tensor)
  # Nesting is decided from the first element only.
  nested = tensor[0].is_a?(Array)

  tensor.map { |entry| nested ? ones_like(entry) : 1 }
end

.post_process_object_detection(outputs, threshold = 0.5, target_sizes = nil, is_zero_shot = false) ⇒ `Object`

# File 'lib/informers/processors.rb', line 668

# Turns raw object-detection model outputs into per-image lists of boxes,
# class indices and scores.
#
# @param outputs [Hash] read for +:logits+ (indexed [image][box][class]) and
#   +:pred_boxes+ (indexed [image][box], each box in center format)
# @param threshold [Float] minimum sigmoid probability for a class to be kept;
#   only consulted when +is_zero_shot+ is truthy
# @param target_sizes [Array, nil] one entry per image used to scale the boxes;
#   presumably +[height, width]+ -- x coordinates are multiplied by index 1 and
#   y coordinates by index 0 (TODO confirm against callers)
# @param is_zero_shot [Boolean] when truthy every class above +threshold+ is
#   reported; otherwise only the arg-max class, unless it is the last class
# @return [Array<Hash>] one hash per image with +:boxes+, +:classes+ and +:scores+
# @raise [Error] if +target_sizes+ is given but its length differs from the batch size
def self.post_process_object_detection(outputs, threshold = 0.5, target_sizes = nil, is_zero_shot = false)
  out_logits = outputs[:logits]
  out_bbox = outputs[:pred_boxes]
  batch_size = out_logits.size
  num_boxes = out_logits[0].size
  num_classes = out_logits[0][0].size

  unless target_sizes.nil? || target_sizes.length == batch_size
    raise Error, "Make sure that you pass in as many target sizes as the batch dimension of the logits"
  end

  Array.new(batch_size) do |image_index|
    target_size = target_sizes.nil? ? nil : target_sizes[image_index]
    info = {
      boxes: [],
      classes: [],
      scores: []
    }
    image_logits = out_logits[image_index]
    image_boxes = out_bbox[image_index]

    num_boxes.times do |box_index|
      logit = image_logits[box_index]

      if is_zero_shot
        # Get indices of classes with high enough probability
        probs = Utils.sigmoid(logit)
        kept = probs.each_index.select { |class_index| probs[class_index] > threshold }
      else
        # Get most probable class
        best_class = Utils.max(logit)[1]

        # The last class is the background class, skip it
        next if best_class == num_classes - 1

        kept = [best_class]

        # Compute softmax over classes
        probs = Utils.softmax(logit)
      end

      kept.each do |class_index|
        # convert to [x0, y0, x1, y1] format
        box = center_to_corners_format(image_boxes[box_index])
        unless target_size.nil?
          # Even positions (x) use target_size[1], odd positions (y) use target_size[0]
          box = box.each_with_index.map { |coord, position| coord * target_size[(position + 1) % 2] }
        end

        info[:boxes] << box
        info[:classes] << class_index
        info[:scores] << probs[class_index]
      end
    end

    info
  end
end

.read_audio(input, sampling_rate) ⇒ `Object`

# File 'lib/informers/utils/audio.rb', line 3

# Loads encoded audio bytes from a URI or a file path and decodes them with
# +ffmpeg_read+.
#
# @param input [URI, String] a URI object (read via open-uri) or a String
#   passed to +File.binread+
# @param sampling_rate [Integer] forwarded to +ffmpeg_read+
# @return [Object] whatever +ffmpeg_read+ returns
# @raise [ArgumentError] if +input+ is neither a URI nor a String
def self.read_audio(input, sampling_rate)
  data =
    case input
    when URI
      # Loaded lazily so open-uri is only required when a URI is actually read.
      require "open-uri"

      input.read
    when String
      File.binread(input)
    else
      raise ArgumentError, "Unsupported input type: #{input.class.name}"
    end

  ffmpeg_read(data, sampling_rate)
end

.reshape(arr, dims) ⇒ `Object`

# File 'lib/informers/utils/tensor.rb', line 64

# Regroups the flattened elements of +arr+ into nested arrays according to +dims+.
#
# @param arr [Array] flat or nested array; it is flattened first
# @param dims [Array<Integer>] target shape; the first entry is not used for
#   slicing, the outermost size follows from the element count
# @return [Array] nested array
def self.reshape(arr, dims)
  flat = arr.flatten

  # Slice by the innermost dimension first, then group those slices outward.
  grouped = dims[1..-1].reverse.inject(flat) do |current, dim|
    current.each_slice(dim)
  end

  grouped.to_a
end

.sigmoid(arr) ⇒ `Object`

# File 'lib/informers/utils/math.rb', line 89

# Applies the logistic function 1 / (1 + e^-x) to every number in a flat or
# nested array.
#
# @param arr [Array] numbers, or arrays of numbers (recursed into)
# @return [Array] array of the same shape holding Float results
def self.sigmoid(arr)
  # Nesting is decided from the first element only.
  nested = arr[0].is_a?(Array)

  arr.map do |entry|
    nested ? sigmoid(entry) : 1 / (1 + Math.exp(-entry))
  end
end

.softmax(arr) ⇒ `Object`

# File 'lib/informers/utils/math.rb', line 73

# Computes the softmax of a flat array of numbers.
#
# @param arr [Array<Numeric>] input scores
# @return [Array<Float>] exponentials of the shifted scores divided by their sum
def self.softmax(arr)
  # Shift by the largest score before exponentiating.
  peak = arr.max
  exponentials = arr.map { |score| Math.exp(score - peak) }
  total = exponentials.sum

  exponentials.map { |value| value / total }
end

.stack(tensors, dim = 0) ⇒ `Object`



26
27
28

# File 'lib/informers/utils/tensor.rb', line 26

# Returns +tensors+ unchanged.
#
# @param tensors [Object] value to return as-is
# @param dim [Integer] accepted but not used by this implementation
# @return [Object] the same object that was passed in as +tensors+
def self.stack(tensors, dim = 0)
  tensors
end

Module: Informers::Utils

Defined Under Namespace

Class Method Summary

Class Method Details

._build_translation_inputs(slf, raw_inputs, tokenizer_options, generate_kwargs) ⇒ Object

.calculate_reflect_offset(i, w) ⇒ Object

.center_to_corners_format(v) ⇒ Object

.dims(tensor) ⇒ Object

.dispatch_callback(progress_callback, data) ⇒ Object

.ffmpeg_read(data, sampling_rate) ⇒ Object

.get_top_items(items, top_k = 0) ⇒ Object

.interpolate(input, shape, mode = "bilinear", align_corners = false) ⇒ Object

.interpolate_data(input, in_shape, out_shape, mode = "bilinear", align_corners = false) ⇒ Object

.max(arr) ⇒ Object

.mean_pooling(last_hidden_state, attention_mask) ⇒ Object

.normalize(result) ⇒ Object

.ones_like(tensor) ⇒ Object

.post_process_object_detection(outputs, threshold = 0.5, target_sizes = nil, is_zero_shot = false) ⇒ Object

.read_audio(input, sampling_rate) ⇒ Object

.reshape(arr, dims) ⇒ Object

.sigmoid(arr) ⇒ Object

.softmax(arr) ⇒ Object

.stack(tensors, dim = 0) ⇒ Object

._build_translation_inputs(slf, raw_inputs, tokenizer_options, generate_kwargs) ⇒ `Object`

.calculate_reflect_offset(i, w) ⇒ `Object`

.center_to_corners_format(v) ⇒ `Object`

.dims(tensor) ⇒ `Object`

.dispatch_callback(progress_callback, data) ⇒ `Object`

.ffmpeg_read(data, sampling_rate) ⇒ `Object`

.get_top_items(items, top_k = 0) ⇒ `Object`

.interpolate(input, shape, mode = "bilinear", align_corners = false) ⇒ `Object`

.interpolate_data(input, in_shape, out_shape, mode = "bilinear", align_corners = false) ⇒ `Object`

.max(arr) ⇒ `Object`

.mean_pooling(last_hidden_state, attention_mask) ⇒ `Object`

.normalize(result) ⇒ `Object`

.ones_like(tensor) ⇒ `Object`

.post_process_object_detection(outputs, threshold = 0.5, target_sizes = nil, is_zero_shot = false) ⇒ `Object`

.read_audio(input, sampling_rate) ⇒ `Object`

.reshape(arr, dims) ⇒ `Object`

.sigmoid(arr) ⇒ `Object`

.softmax(arr) ⇒ `Object`

.stack(tensors, dim = 0) ⇒ `Object`