Class: Transformers::Vit::ViTSelfAttention
- Inherits: Torch::NN::Module
  - Object
  - Torch::NN::Module
  - Transformers::Vit::ViTSelfAttention
- Defined in: lib/transformers/models/vit/modeling_vit.rb
Instance Method Summary
- #forward(hidden_states, head_mask: nil, output_attentions: false) ⇒ Object
- #initialize(config) ⇒ ViTSelfAttention (constructor)
  A new instance of ViTSelfAttention.
- #transpose_for_scores(x) ⇒ Object
Constructor Details
#initialize(config) ⇒ ViTSelfAttention
Returns a new instance of ViTSelfAttention.
```ruby
# File 'lib/transformers/models/vit/modeling_vit.rb', line 102

def initialize(config)
  super()
  if config.hidden_size % config.num_attention_heads != 0 && !config.instance_variable_defined?(:@embedding_size)
    raise ArgumentError,
      "The hidden size #{config.hidden_size} is not a multiple of the number of attention " +
      "heads #{config.num_attention_heads}."
  end

  @num_attention_heads = config.num_attention_heads
  @attention_head_size = (config.hidden_size / config.num_attention_heads).to_i
  @all_head_size = @num_attention_heads * @attention_head_size

  @query = Torch::NN::Linear.new(config.hidden_size, @all_head_size, bias: config.qkv_bias)
  @key = Torch::NN::Linear.new(config.hidden_size, @all_head_size, bias: config.qkv_bias)
  @value = Torch::NN::Linear.new(config.hidden_size, @all_head_size, bias: config.qkv_bias)

  @dropout = Torch::NN::Dropout.new(p: config.attention_probs_dropout_prob)
end
```
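A minimal construction sketch (not taken from the gem's own docs): any object that responds to the attributes read above will do, so a plain Struct stands in for a real ViT config here. The DemoConfig name, the ViT-Base-like numbers (hidden size 768, 12 heads), and loading the gem via `require "transformers"` are assumptions for illustration.

```ruby
require "torch"
require "transformers" # assumed require for the transformers-rb gem

# Hypothetical stand-in for a real ViT config; only the attributes read by
# #initialize are provided.
DemoConfig = Struct.new(
  :hidden_size, :num_attention_heads, :qkv_bias, :attention_probs_dropout_prob,
  keyword_init: true
)

config = DemoConfig.new(
  hidden_size: 768,              # must be divisible by num_attention_heads
  num_attention_heads: 12,       # giving a head size of 768 / 12 = 64
  qkv_bias: true,
  attention_probs_dropout_prob: 0.0
)

attention = Transformers::Vit::ViTSelfAttention.new(config)
```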
Instance Method Details
#forward(hidden_states, head_mask: nil, output_attentions: false) ⇒ Object
```ruby
# File 'lib/transformers/models/vit/modeling_vit.rb', line 127

def forward(
  hidden_states,
  head_mask: nil,
  output_attentions: false
)
  mixed_query_layer = @query.(hidden_states)

  key_layer = transpose_for_scores(@key.(hidden_states))
  value_layer = transpose_for_scores(@value.(hidden_states))
  query_layer = transpose_for_scores(mixed_query_layer)

  # Take the dot product between "query" and "key" to get the raw attention scores.
  attention_scores = Torch.matmul(query_layer, key_layer.transpose(-1, -2))

  attention_scores = attention_scores / Math.sqrt(@attention_head_size)

  # Normalize the attention scores to probabilities.
  attention_probs = Torch::NN::Functional.softmax(attention_scores, dim: -1)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = @dropout.(attention_probs)

  # Mask heads if we want to
  if !head_mask.nil?
    attention_probs = attention_probs * head_mask
  end

  context_layer = Torch.matmul(attention_probs, value_layer)

  context_layer = context_layer.permute(0, 2, 1, 3).contiguous
  new_context_layer_shape = context_layer.size[...-2] + [@all_head_size]
  context_layer = context_layer.view(new_context_layer_shape)

  outputs = output_attentions ? [context_layer, attention_probs] : [context_layer]

  outputs
end
```
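A hedged usage sketch, reusing the `attention` instance and config from the constructor example above. The sequence length 197 assumes a standard 224×224 ViT-Base input (196 patches plus the [CLS] token); the batch size is arbitrary.

```ruby
# [batch, seq_len, hidden_size] patch embeddings; sizes are illustrative.
hidden_states = Torch.randn(2, 197, 768)

context, probs = attention.forward(hidden_states, output_attentions: true)

context.size # => [2, 197, 768]      (same shape as the input)
probs.size   # => [2, 12, 197, 197]  (one attention map per head)
```

With `output_attentions: false` (the default) the returned array contains only the context layer.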
#transpose_for_scores(x) ⇒ Object
```ruby
# File 'lib/transformers/models/vit/modeling_vit.rb', line 121

def transpose_for_scores(x)
  new_x_shape = x.size[...-1] + [@num_attention_heads, @attention_head_size]
  x = x.view(new_x_shape)
  x.permute(0, 2, 1, 3)
end
```
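To make the reshape concrete, a shape walk-through under the same assumed config: the method splits the hidden dimension into per-head slices and moves the head axis ahead of the sequence axis, so the batched matmuls in #forward operate independently per head.

```ruby
x = Torch.randn(2, 197, 768)          # [batch, seq_len, all_head_size]
y = attention.transpose_for_scores(x)
y.size # => [2, 12, 197, 64]          # [batch, num_heads, seq_len, head_size]
```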