Class: Transformers::Distilbert::MultiHeadSelfAttention

Inherits:
Torch::NN::Module
  • Object
Defined in:
lib/transformers/models/distilbert/modeling_distilbert.rb

Direct Known Subclasses

DistilBertFlashAttention2

Instance Method Summary

  #forward(query:, key:, value:, mask:, head_mask: nil, output_attentions: false) ⇒ Object
  #initialize(config) ⇒ MultiHeadSelfAttention constructor
  #prune_heads(heads) ⇒ Object

Constructor Details

#initialize(config) ⇒ MultiHeadSelfAttention

Returns a new instance of MultiHeadSelfAttention.



# File 'lib/transformers/models/distilbert/modeling_distilbert.rb', line 57

def initialize(config)
  super()
  @config = config

  @n_heads = config.n_heads
  @dim = config.dim
  @dropout = Torch::NN::Dropout.new(p: config.attention_dropout)
  @is_causal = false

  # The number of attention heads must evenly divide the hidden dimension
  if @dim % @n_heads != 0
    raise ArgumentError, "self.n_heads: #{@n_heads} must divide self.dim: #{@dim} evenly"
  end

  @q_lin = Torch::NN::Linear.new(config.dim, config.dim)
  @k_lin = Torch::NN::Linear.new(config.dim, config.dim)
  @v_lin = Torch::NN::Linear.new(config.dim, config.dim)
  @out_lin = Torch::NN::Linear.new(config.dim, config.dim)

  @pruned_heads = Set.new
  @attention_head_size = @dim.div(@n_heads)
end
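
A minimal construction sketch, assuming only what the constructor above reads from its argument: any object responding to n_heads, dim, and attention_dropout works, with a DistilBERT config being the usual source of those values. The Struct below is a hypothetical stand-in used purely for illustration.

# Illustrative stand-in for a config object (not the library's config class)
AttentionConfig = Struct.new(:n_heads, :dim, :attention_dropout, keyword_init: true)
config = AttentionConfig.new(n_heads: 12, dim: 768, attention_dropout: 0.1)

attention = Transformers::Distilbert::MultiHeadSelfAttention.new(config)
# 768 is divisible by 12, so each head operates on 768 / 12 = 64 dimensions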

Instance Method Details

#forward(query:, key:, value:, mask:, head_mask: nil, output_attentions: false) ⇒ Object



# File 'lib/transformers/models/distilbert/modeling_distilbert.rb', line 88

def forward(
  query:,
  key:,
  value:,
  mask:,
  head_mask: nil,
  output_attentions: false
)
  bs, _q_length, dim = query.size
  k_length = key.size(1)
  if dim != @dim
    raise "Dimensions do not match: #{dim} input vs #{@dim} configured"
  end
  if key.size != value.size
    raise Todo
  end

  dim_per_head = @dim.div(@n_heads)

  mask_reshp = [bs, 1, 1, k_length]

  shape = lambda do |x|
    x.view(bs, -1, @n_heads, dim_per_head).transpose(1, 2)
  end

  unshape = lambda do |x|
    x.transpose(1, 2).contiguous.view(bs, -1, @n_heads * dim_per_head)
  end

  q = shape.(@q_lin.(query))  # (bs, n_heads, q_length, dim_per_head)
  k = shape.(@k_lin.(key))  # (bs, n_heads, k_length, dim_per_head)
  v = shape.(@v_lin.(value))  # (bs, n_heads, k_length, dim_per_head)

  q = q / Math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
  scores = Torch.matmul(q, k.transpose(2, 3))  # (bs, n_heads, q_length, k_length)
  mask = (mask.eq(0)).view(mask_reshp).expand_as(scores)  # (bs, n_heads, q_length, k_length)
  scores =
    scores.masked_fill(
      # TODO: use Torch.finfo(scores.dtype).min here; filling masked
      # positions with 0 does not fully exclude them from the softmax
      mask, Torch.tensor(0)
    )  # (bs, n_heads, q_length, k_length)

  weights = Torch::NN::Functional.softmax(scores, dim: -1)  # (bs, n_heads, q_length, k_length)
  weights = @dropout.(weights)  # (bs, n_heads, q_length, k_length)

  # Mask heads if we want to
  if !head_mask.nil?
    weights = weights * head_mask
  end

  context = Torch.matmul(weights, v)  # (bs, n_heads, q_length, dim_per_head)
  context = unshape.(context)  # (bs, q_length, dim)
  context = @out_lin.(context)  # (bs, q_length, dim)

  if output_attentions
    [context, weights]
  else
    [context]
  end
end
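
A hedged sketch of a forward pass with the module built above, using torch.rb's Torch.rand and Torch.ones to fabricate inputs; the shapes follow the inline comments in forward, and self-attention simply passes the same hidden states as query, key, and value.

hidden_states = Torch.rand(2, 5, 768)  # (bs, q_length, dim)
mask = Torch.ones(2, 5)                # 1 = attend, 0 = masked out

context, weights = attention.forward(
  query: hidden_states,
  key: hidden_states,
  value: hidden_states,
  mask: mask,
  output_attentions: true
)

context.size  # => [2, 5, 768]    (bs, q_length, dim)
weights.size  # => [2, 12, 5, 5]  (bs, n_heads, q_length, k_length)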

#prune_heads(heads) ⇒ Object

Raises:

  (Todo)
# File 'lib/transformers/models/distilbert/modeling_distilbert.rb', line 81

def prune_heads(heads)
  if heads.length == 0
    return
  end
  raise Todo
end
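
A short behavior sketch, grounded in the body above: an empty list is a no-op, and any non-empty list currently raises the library's Todo error because head pruning is not yet implemented in this port.

attention.prune_heads([])   # => nil, nothing to prune
attention.prune_heads([0])  # raises Todo: pruning is not implemented yet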