Class: Transformers::Mpnet::MPNetSelfAttention

Inherits:

Torch::NN::Module

Object
Torch::NN::Module
Transformers::Mpnet::MPNetSelfAttention

show all

Defined in: lib/transformers/models/mpnet/modeling_mpnet.rb

Instance Method Summary collapse

#forward(hidden_states, attention_mask: nil, head_mask: nil, position_bias: nil, output_attentions: false, **kwargs) ⇒ Object
#initialize(config) ⇒ MPNetSelfAttention constructor

A new instance of MPNetSelfAttention.
#transpose_for_scores(x) ⇒ Object

Constructor Details

#initialize(config) ⇒ `MPNetSelfAttention`

Returns a new instance of MPNetSelfAttention.

# File 'lib/transformers/models/mpnet/modeling_mpnet.rb', line 103

# Builds the projection layers and dropout used by the attention block.
#
# @param config [Object] must respond to +hidden_size+,
#   +num_attention_heads+ and +attention_probs_dropout_prob+
# @raise [ArgumentError] when the hidden size is not divisible by the number
#   of heads and the config has no +@embedding_size+ instance variable
def initialize(config)
  super()

  hidden_size = config.hidden_size
  head_count = config.num_attention_heads

  # The divisibility requirement is waived for configs carrying @embedding_size.
  indivisible = hidden_size % head_count != 0
  if indivisible && !config.instance_variable_defined?(:@embedding_size)
    raise ArgumentError, "The hidden size (#{config.hidden_size}) is not a multiple of the number of attention heads (#{config.num_attention_heads})"
  end

  @num_attention_heads = head_count
  @attention_head_size = (hidden_size / head_count).to_i
  @all_head_size = @num_attention_heads * @attention_head_size

  # Query / key / value projections map hidden_size -> all_head_size.
  @q = Torch::NN::Linear.new(hidden_size, @all_head_size)
  @k = Torch::NN::Linear.new(hidden_size, @all_head_size)
  @v = Torch::NN::Linear.new(hidden_size, @all_head_size)
  # Output projection, applied in #forward after the heads are merged.
  @o = Torch::NN::Linear.new(hidden_size, hidden_size)

  @dropout = Torch::NN::Dropout.new(p: config.attention_probs_dropout_prob)
end

Instance Method Details

#forward(hidden_states, attention_mask: nil, head_mask: nil, position_bias: nil, output_attentions: false, **kwargs) ⇒ `Object`

# File 'lib/transformers/models/mpnet/modeling_mpnet.rb', line 127

# Runs multi-head self-attention over +hidden_states+.
#
# @param hidden_states [Torch::Tensor] input fed to the q/k/v projections
#   (rank 3, since #transpose_for_scores permutes four axes after adding one)
# @param attention_mask [Torch::Tensor, nil] added to the raw scores when given
# @param head_mask [Torch::Tensor, nil] multiplied into the attention
#   probabilities (after dropout) when given
# @param position_bias [Torch::Tensor, nil] added to the scaled scores when given
# @param output_attentions [Boolean] also return the attention probabilities
# @param kwargs [Hash] accepted but not used by this method
# @return [Array] +[output]+, or +[output, attention_probs]+ when
#   +output_attentions+ is truthy
def forward(
  hidden_states,
  attention_mask: nil,
  head_mask: nil,
  position_bias: nil,
  output_attentions: false,
  **kwargs
)
  # Project the input, then split each projection into per-head slices.
  query = transpose_for_scores(@q.(hidden_states))
  key = transpose_for_scores(@k.(hidden_states))
  value = transpose_for_scores(@v.(hidden_states))

  # Scaled dot product of queries and keys gives the raw attention scores.
  scores = Torch.matmul(query, key.transpose(-1, -2)) / Math.sqrt(@attention_head_size)

  # Optional additive terms: relative position bias, then the attention mask.
  scores = scores + position_bias unless position_bias.nil?
  scores = scores + attention_mask unless attention_mask.nil?

  # Softmax over the last axis turns scores into probabilities; dropout follows.
  probs = @dropout.(Torch::NN::Functional.softmax(scores, dim: -1))
  probs = probs * head_mask unless head_mask.nil?

  # Weight the values, then undo the head split: swap axes 1 and 2 back and
  # collapse the trailing two axes into a single axis of size all_head_size.
  context = Torch.matmul(probs, value).permute(0, 2, 1, 3).contiguous
  merged_shape = context.size[...-2] + [@all_head_size]
  context = context.view(*merged_shape)

  projected = @o.(context)

  return [projected, probs] if output_attentions

  [projected]
end

#transpose_for_scores(x) ⇒ `Object`

# File 'lib/transformers/models/mpnet/modeling_mpnet.rb', line 121

# Splits the last axis of +x+ into (num_attention_heads, attention_head_size)
# and swaps axes 1 and 2 of the resulting rank-4 tensor.
#
# @param x [Torch::Tensor] tensor whose last axis is to be split per head
# @return [Torch::Tensor] the reshaped tensor permuted to axis order (0, 2, 1, 3)
def transpose_for_scores(x)
  # Keep every leading dimension; the final one is replaced by two axes.
  *leading_dims, _last_dim = x.size
  x.view(*leading_dims, @num_attention_heads, @attention_head_size)
   .permute(0, 2, 1, 3)
end

Class: Transformers::Mpnet::MPNetSelfAttention

Instance Method Summary collapse

Constructor Details

#initialize(config) ⇒ MPNetSelfAttention

Instance Method Details

#forward(hidden_states, attention_mask: nil, head_mask: nil, position_bias: nil, output_attentions: false, **kwargs) ⇒ Object

#transpose_for_scores(x) ⇒ Object

#initialize(config) ⇒ `MPNetSelfAttention`

#forward(hidden_states, attention_mask: nil, head_mask: nil, position_bias: nil, output_attentions: false, **kwargs) ⇒ `Object`

#transpose_for_scores(x) ⇒ `Object`