Class: Transformers::Distilbert::MultiHeadSelfAttention

Inherits:
Torch::NN::Module
  • Object
Defined in:
lib/transformers/models/distilbert/modeling_distilbert.rb

Direct Known Subclasses

DistilBertFlashAttention2

Instance Method Summary

  #forward(query:, key:, value:, mask:, head_mask: nil, output_attentions: false) ⇒ Object
  #initialize(config) ⇒ MultiHeadSelfAttention constructor
  #prune_heads(heads) ⇒ Object

Constructor Details

#initialize(config) ⇒ MultiHeadSelfAttention

Returns a new instance of MultiHeadSelfAttention.



# File 'lib/transformers/models/distilbert/modeling_distilbert.rb', line 57

def initialize(config)
  super()
  @config = config

  @n_heads = config.n_heads
  @dim = config.dim
  @dropout = Torch::NN::Dropout.new(p: config.attention_dropout)
  @is_causal = false

  # The number of attention heads must evenly divide the hidden dimension
  if @dim % @n_heads != 0
    raise ArgumentError, "self.n_heads: #{@n_heads} must divide self.dim: #{@dim} evenly"
  end

  @q_lin = Torch::NN::Linear.new(config.dim, config.dim)
  @k_lin = Torch::NN::Linear.new(config.dim, config.dim)
  @v_lin = Torch::NN::Linear.new(config.dim, config.dim)
  @out_lin = Torch::NN::Linear.new(config.dim, config.dim)

  @pruned_heads = Set.new
  @attention_head_size = @dim.div(@n_heads)
end
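
A minimal construction sketch, assuming only what the constructor above reads from its argument: any object responding to n_heads, dim, and attention_dropout works, with a DistilBERT config being the usual source of those values. The Struct below is a hypothetical stand-in used purely for illustration.

# Illustrative stand-in for a config object (not the library's config class)
AttentionConfig = Struct.new(:n_heads, :dim, :attention_dropout, keyword_init: true)
config = AttentionConfig.new(n_heads: 12, dim: 768, attention_dropout: 0.1)

attention = Transformers::Distilbert::MultiHeadSelfAttention.new(config)
# 768 is divisible by 12, so each head operates on 768 / 12 = 64 dimensions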

Instance Method Details

#forward(query:, key:, value:, mask:, head_mask: nil, output_attentions: false) ⇒ Object



# File 'lib/transformers/models/distilbert/modeling_distilbert.rb', line 88

def forward(
  query:,
  key:,
  value:,
  mask:,
  head_mask: nil,
  output_attentions: false
)
  bs, _q_length, dim = query.size
  k_length = key.size(1)
  if dim != @dim
    raise "Dimensions do not match: #{dim} input vs #{@dim} configured"
  end
  if key.size != value.size
    raise Todo
  end

  dim_per_head = @dim.div(@n_heads)

  mask_reshp = [bs, 1, 1, k_length]

  shape = lambda do |x|
    x.view(bs, -1, @n_heads, dim_per_head).transpose(1, 2)
  end

  unshape = lambda do |x|
    x.transpose(1, 2).contiguous.view(bs, -1, @n_heads * dim_per_head)
  end

  q = shape.(@q_lin.(query))  # (bs, n_heads, q_length, dim_per_head)
  k = shape.(@k_lin.(key))  # (bs, n_heads, k_length, dim_per_head)
  v = shape.(@v_lin.(value))  # (bs, n_heads, k_length, dim_per_head)

  q = q / Math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
  scores = Torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, q_length, k_length)
  mask = (mask.eq(0)).view(mask_reshp).expand_as(scores)  # (bs, n_heads, q_length, k_length)
  scores =
    scores.masked_fill(
      # TODO: use Torch.finfo(scores.dtype).min here; filling masked
      # positions with 0 does not fully exclude them from the softmax
      mask, Torch.tensor(0)
    )  # (bs, n_heads, q_length, k_length)

  weights = Torch::NN::Functional.softmax(scores, dim: -1)  # (bs, n_heads, q_length, k_length)
  weights = @dropout.(weights)  # (bs, n_heads, q_length, k_length)

  # Mask heads if we want to
  if !head_mask.nil?
    weights = weights * head_mask
  end

  context = Torch.matmul(weights, v)  # (bs, n_heads, q_length, dim_per_head)
  context = unshape.(context)  # (bs, q_length, dim)
  context = @out_lin.(context)  # (bs, q_length, dim)

  if output_attentions
    [context, weights]
  else
    [context]
  end
end
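
A hedged sketch of a forward pass with the module built above, using torch.rb's Torch.rand and Torch.ones to fabricate inputs; the shapes follow the inline comments in forward, and self-attention simply passes the same hidden states as query, key, and value.

hidden_states = Torch.rand(2, 5, 768)  # (bs, q_length, dim)
mask = Torch.ones(2, 5)                # 1 = attend, 0 = masked out

context, weights = attention.forward(
  query: hidden_states,
  key: hidden_states,
  value: hidden_states,
  mask: mask,
  output_attentions: true
)

context.size  # => [2, 5, 768]    (bs, q_length, dim)
weights.size  # => [2, 12, 5, 5]  (bs, n_heads, q_length, k_length)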

#prune_heads(heads) ⇒ Object

Raises:

  (Todo)
# File 'lib/transformers/models/distilbert/modeling_distilbert.rb', line 81

def prune_heads(heads)
  if heads.length == 0
    return
  end
  raise Todo
end
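
A short behavior sketch, grounded in the body above: an empty list is a no-op, and any non-empty list currently raises the library's Todo error because head pruning is not yet implemented in this port.

attention.prune_heads([])   # => nil, nothing to prune
attention.prune_heads([0])  # raises Todo: pruning is not implemented yet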