Class: Transformers::Vit::ViTSelfAttention
- Inherits: Torch::NN::Module
  - Object
  - Torch::NN::Module
  - Transformers::Vit::ViTSelfAttention
- Defined in: lib/transformers/models/vit/modeling_vit.rb
Instance Method Summary
- #forward(hidden_states, head_mask: nil, output_attentions: false) ⇒ Object
- #initialize(config) ⇒ ViTSelfAttention (constructor)
  A new instance of ViTSelfAttention.
- #transpose_for_scores(x) ⇒ Object
Constructor Details
#initialize(config) ⇒ ViTSelfAttention
Returns a new instance of ViTSelfAttention.
```ruby
# File 'lib/transformers/models/vit/modeling_vit.rb', line 102

def initialize(config)
  super()
  if config.hidden_size % config.num_attention_heads != 0 && !config.instance_variable_defined?(:@embedding_size)
    raise ArgumentError,
      "The hidden size #{config.hidden_size} is not a multiple of the number of attention " +
      "heads #{config.num_attention_heads}."
  end

  @num_attention_heads = config.num_attention_heads
  @attention_head_size = (config.hidden_size / config.num_attention_heads).to_i
  @all_head_size = @num_attention_heads * @attention_head_size

  @query = Torch::NN::Linear.new(config.hidden_size, @all_head_size, bias: config.qkv_bias)
  @key = Torch::NN::Linear.new(config.hidden_size, @all_head_size, bias: config.qkv_bias)
  @value = Torch::NN::Linear.new(config.hidden_size, @all_head_size, bias: config.qkv_bias)

  @dropout = Torch::NN::Dropout.new(p: config.attention_probs_dropout_prob)
end
```
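A minimal construction sketch (not taken from the gem's own docs): any object that responds to the attributes read above will do, so a plain Struct stands in for a real ViT config here. The DemoConfig name, the ViT-Base-like numbers (hidden size 768, 12 heads), and loading the gem via `require "transformers"` are assumptions for illustration.

```ruby
require "torch"
require "transformers" # assumed require for the transformers-rb gem

# Hypothetical stand-in for a real ViT config; only the attributes read by
# #initialize are provided.
DemoConfig = Struct.new(
  :hidden_size, :num_attention_heads, :qkv_bias, :attention_probs_dropout_prob,
  keyword_init: true
)

config = DemoConfig.new(
  hidden_size: 768,              # must be divisible by num_attention_heads
  num_attention_heads: 12,       # giving a head size of 768 / 12 = 64
  qkv_bias: true,
  attention_probs_dropout_prob: 0.0
)

attention = Transformers::Vit::ViTSelfAttention.new(config)
```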
Instance Method Details
#forward(hidden_states, head_mask: nil, output_attentions: false) ⇒ Object
```ruby
# File 'lib/transformers/models/vit/modeling_vit.rb', line 127

def forward(
  hidden_states,
  head_mask: nil,
  output_attentions: false
)
  mixed_query_layer = @query.(hidden_states)

  key_layer = transpose_for_scores(@key.(hidden_states))
  value_layer = transpose_for_scores(@value.(hidden_states))
  query_layer = transpose_for_scores(mixed_query_layer)

  # Take the dot product between "query" and "key" to get the raw attention scores.
  attention_scores = Torch.matmul(query_layer, key_layer.transpose(-1, -2))

  attention_scores = attention_scores / Math.sqrt(@attention_head_size)

  # Normalize the attention scores to probabilities.
  attention_probs = Torch::NN::Functional.softmax(attention_scores, dim: -1)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = @dropout.(attention_probs)

  # Mask heads if we want to
  if !head_mask.nil?
    attention_probs = attention_probs * head_mask
  end

  context_layer = Torch.matmul(attention_probs, value_layer)

  context_layer = context_layer.permute(0, 2, 1, 3).contiguous
  new_context_layer_shape = context_layer.size[...-2] + [@all_head_size]
  context_layer = context_layer.view(new_context_layer_shape)

  outputs = output_attentions ? [context_layer, attention_probs] : [context_layer]

  outputs
end
```
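A hedged usage sketch, reusing the `attention` instance and config from the constructor example above. The sequence length 197 assumes a standard 224×224 ViT-Base input (196 patches plus the [CLS] token); the batch size is arbitrary.

```ruby
# [batch, seq_len, hidden_size] patch embeddings; sizes are illustrative.
hidden_states = Torch.randn(2, 197, 768)

context, probs = attention.forward(hidden_states, output_attentions: true)

context.size # => [2, 197, 768]      (same shape as the input)
probs.size   # => [2, 12, 197, 197]  (one attention map per head)
```

With `output_attentions: false` (the default) the returned array contains only the context layer.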
#transpose_for_scores(x) ⇒ Object
```ruby
# File 'lib/transformers/models/vit/modeling_vit.rb', line 121

def transpose_for_scores(x)
  new_x_shape = x.size[...-1] + [@num_attention_heads, @attention_head_size]
  x = x.view(new_x_shape)
  x.permute(0, 2, 1, 3)
end
```
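To make the reshape concrete, a shape walk-through under the same assumed config: the method splits the hidden dimension into per-head slices and moves the head axis ahead of the sequence axis, so the batched matmuls in #forward operate independently per head.

```ruby
x = Torch.randn(2, 197, 768)          # [batch, seq_len, all_head_size]
y = attention.transpose_for_scores(x)
y.size # => [2, 12, 197, 64]          # [batch, num_heads, seq_len, head_size]
```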