Module: Informers::Utils

Defined in:
lib/informers/utils/hub.rb,
lib/informers/processors.rb,
lib/informers/tokenizers.rb,
lib/informers/utils/core.rb,
lib/informers/utils/math.rb,
lib/informers/utils/audio.rb,
lib/informers/utils/image.rb,
lib/informers/utils/ffmpeg.rb,
lib/informers/utils/tensor.rb,
lib/informers/utils/generation.rb

Defined Under Namespace

Modules: Hub Classes: BeamSearchSampler, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, GenerationConfig, GreedySampler, LogitsProcessor, LogitsProcessorList, MinLengthLogitsProcessor, NoRepeatNGramLogitsProcessor, RawImage, Sampler

Class Method Summary collapse

Class Method Details

._build_translation_inputs(slf, raw_inputs, tokenizer_options, generate_kwargs) ⇒ Object



216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# File 'lib/informers/tokenizers.rb', line 216

# Checks that a tokenizer is usable for translation, forces the target
# language via `forced_bos_token_id`, and tokenizes the raw inputs.
#
# The source language, when given, is only validated here; it is not applied
# to the tokenizer by this method.
#
# @param slf [Object] tokenizer exposing `language_codes` (Array),
#   `language_regex` (Regexp) and `lang_to_token` (callable)
# @param raw_inputs [Object] inputs passed through to the tokenizer call
# @param tokenizer_options [Hash] keyword options passed to the tokenizer call
# @param generate_kwargs [Hash] read for `:src_lang` and `:tgt_lang`; receives
#   the `"forced_bos_token_id"` entry as a side effect
# @return [Object] the result of calling the tokenizer
# @raise [Error] when a required attribute is missing or a language code is
#   not listed in `language_codes`
def self._build_translation_inputs(slf, raw_inputs, tokenizer_options, generate_kwargs)
  unless slf.respond_to?(:language_codes) && slf.language_codes.is_a?(Array)
    raise Error, "Tokenizer must have `language_codes` attribute set and it should be an array of language ids."
  end
  unless slf.respond_to?(:language_regex) && slf.language_regex.is_a?(Regexp)
    raise Error, "Tokenizer must have `language_regex` attribute set and it should be a regular expression."
  end
  unless slf.respond_to?(:lang_to_token) && slf.lang_to_token.respond_to?(:call)
    raise Error, "Tokenizer must have `lang_to_token` attribute set and it should be a function."
  end

  source_lang = generate_kwargs[:src_lang]
  target_lang = generate_kwargs[:tgt_lang]

  # The target language is mandatory and must be a known code
  unless slf.language_codes.include?(target_lang)
    raise Error, "Target language code #{target_lang.inspect} is not valid. Must be one of: #{slf.language_codes.join(", ")}"
  end

  # The source language is optional, but must be a known code when present
  if !source_lang.nil? && !slf.language_codes.include?(source_lang)
    raise Error, "Source language code #{source_lang.inspect} is not valid. Must be one of: #{slf.language_codes.join(", ")}"
  end

  # Force the first generated token to be the target language token
  target_token = slf.lang_to_token.call(target_lang)
  generate_kwargs["forced_bos_token_id"] = slf.convert_tokens_to_ids([target_token])[0]

  slf.call(raw_inputs, **tokenizer_options)
end

.calculate_reflect_offset(i, w) ⇒ Object



7
8
9
# File 'lib/informers/utils/core.rb', line 7

# Shifts `i` by `w`, wraps it into a window of length `2 * w`, shifts back
# and returns the absolute value.
#
# @param i [Numeric] position to map
# @param w [Numeric] half of the wrap-around period
# @return [Numeric] the absolute offset
def self.calculate_reflect_offset(i, w)
  period = 2 * w
  wrapped = (i + w) % period
  (wrapped - w).abs
end

.center_to_corners_format(v) ⇒ Object



658
659
660
661
662
663
664
665
666
# File 'lib/informers/processors.rb', line 658

# Converts a box given as [center_x, center_y, width, height] into
# [x0, y0, x1, y1] corner coordinates.
#
# @param v [Array<Numeric>] center-format box
# @return [Array<Float>] corner-format box
def self.center_to_corners_format(v)
  center_x, center_y, box_width, box_height = v
  half_width = box_width / 2.0
  half_height = box_height / 2.0

  x0 = center_x - half_width
  y0 = center_y - half_height
  x1 = center_x + half_width
  y1 = center_y + half_height

  [x0, y0, x1, y1]
end

.dims(tensor) ⇒ Object



37
38
39
40
41
42
43
44
# File 'lib/informers/utils/tensor.rb', line 37

# Returns the shape of a nested array by following the first element of each
# nesting level and recording that level's size.
#
# @param tensor [Array, Object] nested array (a non-array yields [])
# @return [Array<Integer>] one size per nesting level
def self.dims(tensor)
  shape = []
  level = tensor
  loop do
    break unless level.is_a?(Array)

    shape.push(level.size)
    level = level[0]
  end
  shape
end

.dispatch_callback(progress_callback, data) ⇒ Object



3
4
5
# File 'lib/informers/utils/core.rb', line 3

# Invokes the progress callback with `data` when a callback was supplied.
#
# @param progress_callback [#call, nil] optional callback
# @param data [Object] payload handed to the callback
# @return [Object, nil] the callback's return value, or nil without a callback
def self.dispatch_callback(progress_callback, data)
  return nil unless progress_callback

  progress_callback.call(data)
end

.ffmpeg_read(data, sampling_rate) ⇒ Object

Adapted from the Transformers Python library.



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/informers/utils/ffmpeg.rb', line 18

# Decodes encoded audio bytes into mono 32-bit float samples by piping them
# through the `ffmpeg` executable.
#
# @param data [String] raw bytes of the encoded audio, written to ffmpeg's stdin
# @param sampling_rate [Integer] sample rate passed to ffmpeg's `-ar` option
# @return [Array<Float>] decoded samples (little-endian f32 stream unpacked)
# @raise [Error] if the ffmpeg executable cannot be found, or if ffmpeg exits
#   with a non-zero status
def self.ffmpeg_read(data, sampling_rate)
  ffmpeg_command = [
    "ffmpeg",
    "-i",
    "pipe:0",
    "-ac",
    "1",
    "-ar",
    sampling_rate.to_s,
    "-f",
    "f32le",
    "-hide_banner",
    "-loglevel",
    "quiet",
    "pipe:1"
  ]

  begin
    # binmode keeps the raw audio bytes untouched on platforms that would
    # otherwise apply text-mode translation to the pipes
    stdout, status = Open3.capture2(*ffmpeg_command, stdin_data: data, binmode: true)
  rescue Errno::ENOENT
    # A missing executable surfaces as ENOENT from the spawn, not as a failed
    # exit status, so it has to be translated here
    raise Error, "ffmpeg was not found but is required to load audio files from filename"
  end

  unless status.success?
    raise Error, "ffmpeg failed to decode the audio data (exit status: #{status.exitstatus.inspect})"
  end

  stdout.unpack("e*")
end

.get_top_items(items, top_k = 0) ⇒ Object



96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/informers/utils/math.rb', line 96

# Pairs every score with its index and orders the pairs by descending score.
#
# @param items [Array<Numeric>] scores
# @param top_k [Integer, nil] number of pairs to keep; nil or a value that is
#   not greater than zero keeps all of them
# @return [Array<Array>] `[index, score]` pairs, highest score first
def self.get_top_items(items, top_k = 0)
  indexed = items.each_with_index.map { |score, index| [index, score] }
  ranked = indexed.sort_by { |_index, score| -score }

  limited = !top_k.nil? && top_k > 0
  limited ? ranked[0, top_k] : ranked
end

.interpolate(input, shape, mode = "bilinear", align_corners = false) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/informers/utils/tensor.rb', line 46

# Resizes a nested-array image to `shape` by delegating to `interpolate_data`
# and reshaping the flat result to [channels, out_height, out_width].
#
# The channel count is taken from the third-to-last dimension of `input`, or
# 1 when `input` has fewer than three dimensions.
#
# @param input [Array] nested array whose last two dimensions are height and width
# @param shape [Array<Integer>] `[out_height, out_width]`
# @param mode [String] forwarded to `interpolate_data`
# @param align_corners [Boolean] forwarded to `interpolate_data`
# @return [Array] nested array of shape [channels, out_height, out_width]
def self.interpolate(input, shape, mode = "bilinear", align_corners = false)
  target_height, target_width = shape

  # Shape of the incoming image
  input_dims = dims(input)
  channels = input_dims[-3] || 1
  source_height = input_dims[-2]
  source_width = input_dims[-1]

  source_shape = [channels, source_height, source_width]
  target_shape = [target_height, target_width]
  resized = interpolate_data(input.flatten, source_shape, target_shape, mode, align_corners)

  reshape(resized, [channels, target_height, target_width])
end

.interpolate_data(input, in_shape, out_shape, mode = "bilinear", align_corners = false) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# File 'lib/informers/utils/math.rb', line 3

# Resizes a flat, channel-major image buffer with bilinear weights.
#
# `mode` and `align_corners` are accepted but not used by this implementation.
#
# @param input [Array<Numeric>] flat buffer laid out as channel, then row, then column
# @param in_shape [Array<Integer>] `[channels, height, width]` of `input`
# @param out_shape [Array<Integer>] `[height, width]` of the result
# @param mode [String] currently ignored
# @param align_corners [Boolean] currently ignored
# @return [Array<Numeric>] flat buffer of size channels * out_height * out_width
def self.interpolate_data(input, in_shape, out_shape, mode = "bilinear", align_corners = false)
  channels, src_height, src_width = in_shape
  dst_height, dst_width = out_shape

  # TODO use mode and align_corners

  # Ratio between output and input sizes along each axis
  scale_x = dst_width / src_width.to_f
  scale_y = dst_height / src_height.to_f

  # Number of elements in one channel plane of the input and the output
  src_plane = src_height * src_width
  dst_plane = dst_height * dst_width

  result = Array.new(dst_plane * channels)

  (0...dst_height).each do |row|
    # Everything that depends only on the output row is computed once per row
    src_y = (row + 0.5) / scale_y - 0.5
    top = src_y.floor
    bottom = [top + 1, src_height - 1].min
    top = [top, 0].max
    frac_y = src_y - top

    top_base = top * src_width
    bottom_base = bottom * src_width

    (0...dst_width).each do |col|
      # Source x coordinate for this output column
      src_x = (col + 0.5) / scale_x - 0.5

      # Neighbouring source columns, clamped to the image bounds; the right
      # neighbour is derived from the unclamped left one
      left = src_x.floor
      right = [left + 1, src_width - 1].min
      left = [left, 0].max
      frac_x = src_x - left

      # Bilinear weights of the four neighbours
      weight_tl = (1 - frac_x) * (1 - frac_y)
      weight_tr = frac_x * (1 - frac_y)
      weight_bl = (1 - frac_x) * frac_y
      weight_br = frac_x * frac_y

      # Offsets of the four neighbours inside one channel plane
      idx_tl = top_base + left
      idx_tr = top_base + right
      idx_bl = bottom_base + left
      idx_br = bottom_base + right

      pixel = row * dst_width + col

      channels.times do |channel|
        base = channel * src_plane

        result[channel * dst_plane + pixel] =
          weight_tl * input[base + idx_tl] +
          weight_tr * input[base + idx_tr] +
          weight_bl * input[base + idx_bl] +
          weight_br * input[base + idx_br]
      end
    end
  end

  result
end

.max(arr) ⇒ Object



110
111
112
113
114
115
# File 'lib/informers/utils/math.rb', line 110

# Finds the largest element of an array together with its position.
#
# @param arr [Array] non-empty array of comparable values
# @return [Array] `[value, index]` of the first maximal element
# @raise [Error] if the array is empty
def self.max(arr)
  raise Error, "Array must not be empty" if arr.length == 0

  arr.each_with_index.max_by { |value, _index| value }
end

.mean_pooling(last_hidden_state, attention_mask) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# File 'lib/informers/utils/tensor.rb', line 3

# Averages token vectors per sequence, weighting each token by its attention
# mask value.
#
# The divisor is floored at a tiny positive value so that a sequence whose
# mask sums to zero produces zeros instead of NaN. Sequences with at least
# one attended token are unaffected.
#
# @param last_hidden_state [Array<Array<Array<Numeric>>>] token vectors,
#   indexed as [sequence][token][feature]
# @param attention_mask [Array<Array<Numeric>>] mask values, indexed as
#   [sequence][token]
# @return [Array<Array<Float>>] one pooled vector per sequence
def self.mean_pooling(last_hidden_state, attention_mask)
  min_count = 1e-9

  last_hidden_state.zip(attention_mask).map do |state, mask|
    state[0].size.times.map do |k|
      sum = 0.0
      count = 0

      state.zip(mask) do |s, m|
        count += m
        sum += s[k] * m
      end

      # Guard against division by zero for fully masked sequences
      divisor = count < min_count ? min_count : count
      sum / divisor
    end
  end
end

.normalize(result) ⇒ Object



19
20
21
22
23
24
# File 'lib/informers/utils/tensor.rb', line 19

# Scales every row to unit Euclidean length.
#
# The norm is floored at a small epsilon so an all-zero row stays all zeros
# instead of turning into NaN. Rows with a norm above the epsilon are divided
# by their exact norm, as before.
#
# @param result [Array<Array<Numeric>>] rows to normalize
# @return [Array<Array<Float>>] normalized rows
def self.normalize(result)
  epsilon = 1e-12

  result.map do |row|
    norm = Math.sqrt(row.sum { |v| v * v })
    # Guard against division by zero for zero-length rows
    divisor = norm < epsilon ? epsilon : norm
    row.map { |v| v / divisor }
  end
end

.ones_like(tensor) ⇒ Object



30
31
32
33
34
35
# File 'lib/informers/utils/tensor.rb', line 30

# Builds a nested array with the same structure as `tensor` in which every
# leaf is the integer 1. Nesting is detected from the first element only.
#
# @param tensor [Array] flat or nested array
# @return [Array] array of the same structure filled with 1
def self.ones_like(tensor)
  if tensor[0].is_a?(Array)
    tensor.map { |inner| ones_like(inner) }
  else
    tensor.map { 1 }
  end
end

.post_process_object_detection(outputs, threshold = 0.5, target_sizes = nil, is_zero_shot = false) ⇒ Object



668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
# File 'lib/informers/processors.rb', line 668

# Converts raw object-detection outputs into boxes, class indices and scores
# for every image in the batch.
#
# In zero-shot mode every class whose sigmoid probability exceeds `threshold`
# is reported for a box. Otherwise only the most probable class is
# considered: the box is dropped when that class is the last one (treated as
# background) or when its softmax probability is below `threshold`.
#
# @param outputs [Hash] must provide `:logits` ([batch][box][class]) and
#   `:pred_boxes` ([batch][box] as [center_x, center_y, width, height])
# @param threshold [Numeric] minimum score for a detection to be kept
# @param target_sizes [Array<Array<Numeric>>, nil] optional `[height, width]`
#   per image, used to scale the boxes
# @param is_zero_shot [Boolean] selects the zero-shot (sigmoid) scoring path
# @return [Array<Hash>] one hash per image with `:boxes`, `:classes`, `:scores`
# @raise [Error] if `target_sizes` is given but does not match the batch size
def self.post_process_object_detection(outputs, threshold = 0.5, target_sizes = nil, is_zero_shot = false)
  out_logits = outputs[:logits]
  out_bbox = outputs[:pred_boxes]
  batch_size = out_logits.size
  num_boxes = out_logits[0].size
  num_classes = out_logits[0][0].size

  if !target_sizes.nil? && target_sizes.length != batch_size
    raise Error, "Make sure that you pass in as many target sizes as the batch dimension of the logits"
  end

  batch_size.times.map do |i|
    target_size = target_sizes.nil? ? nil : target_sizes[i]
    info = {
      boxes: [],
      classes: [],
      scores: []
    }
    logits = out_logits[i]
    bbox = out_bbox[i]

    num_boxes.times do |j|
      logit = logits[j]

      indices = []
      if is_zero_shot
        # Keep every class with a high enough probability
        probs = Utils.sigmoid(logit)
        probs.each_with_index do |prob, k|
          indices << k if prob > threshold
        end
      else
        # Only the most probable class is a candidate
        max_index = Utils.max(logit)[1]

        # The last class is the background class, skip it
        next if max_index == num_classes - 1

        # Compute softmax over classes
        probs = Utils.softmax(logit)

        # Drop detections that do not reach the requested confidence
        next if probs[max_index] < threshold

        indices << max_index
      end

      indices.each do |index|
        # convert to [x0, y0, x1, y1] format
        box = center_to_corners_format(bbox[j])
        unless target_size.nil?
          # x coordinates (even positions) scale with the width at
          # target_size[1], y coordinates with the height at target_size[0]
          box = box.map.with_index { |coord, pos| coord * target_size[(pos + 1) % 2] }
        end

        info[:boxes] << box
        info[:classes] << index
        info[:scores] << probs[index]
      end
    end

    info
  end
end

.read_audio(input, sampling_rate) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
# File 'lib/informers/utils/audio.rb', line 3

# Loads encoded audio from a URI or a file path and decodes it with
# `ffmpeg_read`.
#
# @param input [URI, String] a URI object (read via open-uri) or a file path
# @param sampling_rate [Integer] forwarded to `ffmpeg_read`
# @return [Object] whatever `ffmpeg_read` returns
# @raise [ArgumentError] if `input` is neither a URI nor a String
def self.read_audio(input, sampling_rate)
  data =
    case input
    when URI
      # Loaded lazily: only needed for remote inputs
      require "open-uri"

      input.read
    when String
      File.binread(input)
    else
      raise ArgumentError, "Unsupported input type: #{input.class.name}"
    end

  ffmpeg_read(data, sampling_rate)
end

.reshape(arr, dims) ⇒ Object



64
65
66
67
68
69
70
# File 'lib/informers/utils/tensor.rb', line 64

# Flattens `arr` and regroups its elements into nested arrays following
# `dims`. The first dimension is not enforced; it results from the grouping
# of the remaining ones.
#
# @param arr [Array] flat or nested array
# @param dims [Array<Integer>] target shape
# @return [Array] nested array
def self.reshape(arr, dims)
  flat = arr.flatten
  # Group by the innermost dimension first, then work outwards
  inner_dims = dims[1..-1].reverse
  grouped = inner_dims.inject(flat) { |acc, length| acc.each_slice(length) }
  grouped.to_a
end

.sigmoid(arr) ⇒ Object



89
90
91
92
93
94
# File 'lib/informers/utils/math.rb', line 89

# Applies the logistic function 1 / (1 + e^-x) to every leaf of a flat or
# nested array. Nesting is detected from the first element only.
#
# @param arr [Array] flat or nested array of numbers
# @return [Array] array of the same structure with sigmoid values
def self.sigmoid(arr)
  unless arr[0].is_a?(Array)
    return arr.map { |x| 1 / (1 + Math.exp(-x)) }
  end

  arr.map { |row| sigmoid(row) }
end

.softmax(arr) ⇒ Object



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/informers/utils/math.rb', line 73

# Computes the softmax of an array of numbers. The maximum is subtracted
# before exponentiation so large inputs do not overflow.
#
# @param arr [Array<Numeric>] input scores
# @return [Array<Float>] probabilities that sum to 1
def self.softmax(arr)
  peak = arr.max

  # Exponentials of the shifted scores
  shifted = arr.map { |score| Math.exp(score - peak) }
  total = shifted.sum

  shifted.map { |value| value / total }
end

.stack(tensors, dim = 0) ⇒ Object



26
27
28
# File 'lib/informers/utils/tensor.rb', line 26

# Stacks equally shaped nested arrays along a new dimension.
#
# With `dim = 0` (the default) the list of tensors already is the stacked
# result and is returned unchanged. For other values the new axis is inserted
# at position `dim`; negative values count from the end, so -1 places the new
# axis last.
#
# @param tensors [Array<Array>] nested arrays of identical shape
# @param dim [Integer] position of the new axis
# @return [Array] the stacked nested array
# @raise [ArgumentError] if `dim` is outside the valid range for the inputs
def self.stack(tensors, dim = 0)
  return tensors if dim == 0

  # Number of dimensions of the inputs, measured along first elements
  rank = 0
  probe = tensors[0]
  while probe.is_a?(Array)
    rank += 1
    probe = probe[0]
  end

  axis = dim < 0 ? dim + rank + 1 : dim
  unless axis.between?(0, rank)
    raise ArgumentError, "Dimension out of range (expected to be in range of [#{-(rank + 1)}, #{rank}], but got #{dim})"
  end

  # Moving the new axis one level deeper means: for every index of the
  # current leading axis, gather that slice from each tensor and recurse
  interleave = lambda do |parts, depth|
    next parts if depth == 0

    parts[0].size.times.map do |i|
      interleave.call(parts.map { |part| part[i] }, depth - 1)
    end
  end

  interleave.call(tensors, axis)
end