Class: Transformers::Mpnet::MPNetSelfAttention
- Inherits: Torch::NN::Module
  - Object
  - Torch::NN::Module
  - Transformers::Mpnet::MPNetSelfAttention
- Defined in: lib/transformers/models/mpnet/modeling_mpnet.rb
Instance Method Summary
- #forward(hidden_states, attention_mask: nil, head_mask: nil, position_bias: nil, output_attentions: false, **kwargs) ⇒ Object
- #initialize(config) ⇒ MPNetSelfAttention (constructor)
  A new instance of MPNetSelfAttention.
- #transpose_for_scores(x) ⇒ Object
Constructor Details
#initialize(config) ⇒ MPNetSelfAttention
Returns a new instance of MPNetSelfAttention.
# File 'lib/transformers/models/mpnet/modeling_mpnet.rb', line 103

def initialize(config)
  super()
  if config.hidden_size % config.num_attention_heads != 0 && !config.instance_variable_defined?(:@embedding_size)
    raise ArgumentError, "The hidden size (#{config.hidden_size}) is not a multiple of the number of attention heads (#{config.num_attention_heads})"
  end

  @num_attention_heads = config.num_attention_heads
  @attention_head_size = (config.hidden_size / config.num_attention_heads).to_i
  @all_head_size = @num_attention_heads * @attention_head_size

  # Separate linear projections for query, key and value, plus the output projection.
  @q = Torch::NN::Linear.new(config.hidden_size, @all_head_size)
  @k = Torch::NN::Linear.new(config.hidden_size, @all_head_size)
  @v = Torch::NN::Linear.new(config.hidden_size, @all_head_size)
  @o = Torch::NN::Linear.new(config.hidden_size, config.hidden_size)

  @dropout = Torch::NN::Dropout.new(p: config.attention_probs_dropout_prob)
end
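The constructor only reads hidden_size, num_attention_heads and attention_probs_dropout_prob from the config. A minimal sketch of building the layer, assuming the transformers-rb gem (and torch-rb with it) is already loaded, and using a hypothetical FakeConfig Struct in place of a real MPNetConfig:

  # FakeConfig is an illustrative stand-in only; it exposes just the attributes
  # the constructor reads. instance_variable_defined? is an ordinary Object
  # method, so it works on a Struct instance too.
  FakeConfig = Struct.new(
    :hidden_size, :num_attention_heads, :attention_probs_dropout_prob,
    keyword_init: true
  )

  config = FakeConfig.new(
    hidden_size: 768,
    num_attention_heads: 12,
    attention_probs_dropout_prob: 0.1
  )

  attn = Transformers::Mpnet::MPNetSelfAttention.new(config)
  # Each of the 12 heads attends over 768 / 12 = 64 dimensions,
  # so @all_head_size is 768.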
Instance Method Details
#forward(hidden_states, attention_mask: nil, head_mask: nil, position_bias: nil, output_attentions: false, **kwargs) ⇒ Object
# File 'lib/transformers/models/mpnet/modeling_mpnet.rb', line 127

def forward(
  hidden_states,
  attention_mask: nil,
  head_mask: nil,
  position_bias: nil,
  output_attentions: false,
  **kwargs
)
  # Project the hidden states into query, key and value spaces.
  q = @q.(hidden_states)
  k = @k.(hidden_states)
  v = @v.(hidden_states)

  # Reshape to [batch, num_heads, seq_len, head_size].
  q = transpose_for_scores(q)
  k = transpose_for_scores(k)
  v = transpose_for_scores(v)

  # Take the dot product between "query" and "key" to get the raw attention scores.
  attention_scores = Torch.matmul(q, k.transpose(-1, -2))
  attention_scores = attention_scores / Math.sqrt(@attention_head_size)

  # Apply relative position embedding (precomputed in MPNetEncoder) if provided.
  if !position_bias.nil?
    attention_scores += position_bias
  end

  # Add the (additive) attention mask, if given.
  if !attention_mask.nil?
    attention_scores = attention_scores + attention_mask
  end

  # Normalize the attention scores to probabilities.
  attention_probs = Torch::NN::Functional.softmax(attention_scores, dim: -1)

  attention_probs = @dropout.(attention_probs)

  # Zero out attention for masked heads, if a head mask is given.
  if !head_mask.nil?
    attention_probs = attention_probs * head_mask
  end

  c = Torch.matmul(attention_probs, v)

  # Merge the heads back into a single [batch, seq_len, all_head_size] tensor.
  c = c.permute(0, 2, 1, 3).contiguous
  new_c_shape = c.size[...-2] + [@all_head_size]
  c = c.view(*new_c_shape)

  o = @o.(c)

  outputs = output_attentions ? [o, attention_probs] : [o]
  outputs
end
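A usage sketch, assuming attn is the instance built above and that dummy tensors stand in for real encoder inputs. The attention mask here is additive (0.0 for positions to keep, a large negative number for positions to ignore), matching how forward simply adds it to the scores:

  batch_size, seq_len, hidden_size = 2, 8, 768

  hidden_states = Torch.randn(batch_size, seq_len, hidden_size)

  # Additive mask broadcastable to [batch, num_heads, seq_len, seq_len];
  # all zeros here, i.e. nothing is masked.
  attention_mask = Torch.zeros(batch_size, 1, 1, seq_len)

  output, attention_probs = attn.forward(
    hidden_states,
    attention_mask: attention_mask,
    output_attentions: true
  )

  output.shape          # => [2, 8, 768]
  attention_probs.shape # => [2, 12, 8, 8]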
#transpose_for_scores(x) ⇒ Object
# File 'lib/transformers/models/mpnet/modeling_mpnet.rb', line 121

# Reshapes [batch, seq_len, all_head_size] into [batch, num_heads, seq_len, head_size].
def transpose_for_scores(x)
  new_x_shape = x.size[...-1] + [@num_attention_heads, @attention_head_size]
  x = x.view(*new_x_shape)
  x.permute(0, 2, 1, 3)
end
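To make the reshape concrete, a short sketch with illustrative sizes (12 heads of 64 dimensions each, i.e. hidden_size 768) replays the same steps on a dummy tensor:

  x = Torch.randn(2, 8, 768)              # [batch, seq_len, all_head_size]
  new_x_shape = x.size[...-1] + [12, 64]  # split the last dimension into heads
  x = x.view(*new_x_shape)                # [2, 8, 12, 64]
  x = x.permute(0, 2, 1, 3)               # [2, 12, 8, 64]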