Module: Transformers::ModuleUtilsMixin
- Included in: PreTrainedModel
- Defined in: lib/transformers/modeling_utils.rb
Instance Method Summary
- #device ⇒ Object
- #get_extended_attention_mask(attention_mask, input_shape, device: nil, dtype: nil) ⇒ Object
- #get_head_mask(head_mask, num_hidden_layers, is_attention_chunked: false) ⇒ Object
Instance Method Details
#device ⇒ Object
```ruby
# File 'lib/transformers/modeling_utils.rb', line 59

def device
  get_parameter_device(self)
end
```
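As a rough illustration (not part of this gem's API), a module's device is simply the device of one of its parameters, which is what the private `get_parameter_device` helper inspects. The `Torch::NN::Linear` module below is only a stand-in:

```ruby
require "torch"

# Illustrative only: a module "lives" wherever its parameters live.
linear = Torch::NN::Linear.new(4, 2)
device = linear.parameters.first.device
p device.type # => "cpu" (or "cuda" if the module has been moved to a GPU)
```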
#get_extended_attention_mask(attention_mask, input_shape, device: nil, dtype: nil) ⇒ Object
```ruby
# File 'lib/transformers/modeling_utils.rb', line 63

def get_extended_attention_mask(
  attention_mask,
  input_shape,
  device: nil,
  dtype: nil
)
  if dtype.nil?
    dtype = @dtype
  end

  if !(attention_mask.dim == 2 && @config.is_decoder)
    # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder`
    if !device.nil?
      raise Todo
    end
  end

  # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
  # ourselves in which case we just need to make it broadcastable to all heads.
  if attention_mask.dim == 3
    raise Todo
  elsif attention_mask.dim == 2
    # Provided a padding mask of dimensions [batch_size, seq_length]
    # - if the model is a decoder, apply a causal mask in addition to the padding mask
    # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
    if @config.is_decoder
      raise Todo
    else
      extended_attention_mask = attention_mask[0.., nil, nil, 0..]
    end
  else
    raise Todo
  end

  # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
  # masked positions, this operation will create a tensor which is 0.0 for
  # positions we want to attend and the dtype's smallest value for masked positions.
  # Since we are adding it to the raw scores before the softmax, this is
  # effectively the same as removing these entirely.
  extended_attention_mask = extended_attention_mask.to(dtype: dtype) # fp16 compatibility
  # TODO use Torch.finfo
  extended_attention_mask = (1.0 - extended_attention_mask) * -3.40282e+38
  extended_attention_mask
end
```
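To make the additive-mask trick concrete, here is a hedged sketch of the 2-D encoder path using plain torch-rb tensors. The tensor values and the standalone script are illustrative; only the operations mirror the method body above:

```ruby
require "torch"

# A padding mask for one sequence of length 4 where the last token is padding.
attention_mask = Torch.tensor([[1, 1, 1, 0]]) # [batch_size, seq_length]

# Insert two singleton dimensions so the mask broadcasts over
# [batch_size, num_heads, seq_length, seq_length].
extended = attention_mask[0.., nil, nil, 0..] # shape [1, 1, 1, 4]

# Flip it into an additive mask: 0.0 where we attend, a huge negative where masked.
extended = extended.to(dtype: :float32)
extended = (1.0 - extended) * -3.40282e+38

p extended.shape # => [1, 1, 1, 4]
# Adding this to the raw attention scores before the softmax effectively
# removes the padded position from attention.
```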
#get_head_mask(head_mask, num_hidden_layers, is_attention_chunked: false) ⇒ Object
```ruby
# File 'lib/transformers/modeling_utils.rb', line 107

def get_head_mask(head_mask, num_hidden_layers, is_attention_chunked: false)
  if !head_mask.nil?
    head_mask = _convert_head_mask_to_5d(head_mask, num_hidden_layers)
    if is_attention_chunked == true
      head_mask = head_mask.unsqueeze(-1)
    end
  else
    head_mask = [nil] * num_hidden_layers
  end

  head_mask
end
```
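The nil branch simply hands each layer a nil mask. For the non-nil branch, `_convert_head_mask_to_5d` is private; the sketch below assumes it behaves like the upstream Python implementation, expanding a 1-D `[num_heads]` mask to `[num_hidden_layers, batch, num_heads, seq_length, seq_length]` (an assumption, shown with plain torch-rb operations rather than the helper itself):

```ruby
require "torch"

num_hidden_layers = 12
head_mask = Torch.tensor([1.0, 1.0, 0.0, 1.0]) # keep heads 0, 1 and 3; silence head 2

# Assumed expansion to 5-D so it can broadcast against per-layer attention weights.
expanded = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
expanded = expanded.expand([num_hidden_layers, -1, -1, -1, -1])
p expanded.shape # => [12, 1, 4, 1, 1]

# With no mask at all, get_head_mask returns [nil] * num_hidden_layers,
# i.e. "mask nothing" for every layer.
```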