
    d                       d dl Z d dlZd dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZ ddlmZ ddlmZmZ ddlmZmZmZmZmZ dd	lmZm Z m!Z! d
dl"m#Z#m$Z$ d
dl%m&Z&m'Z'm(Z( d
dl)m*Z* d
dl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZE d
dlFmGZGmHZHmIZImJZJmKZK erddlLmMZM d
dlNmOZO  e!jP        eQ          ZRe G d de                       ZSe G d de                       ZTe G d de                       ZUe G d de                       ZVe G d de                       ZWe G d de                       ZXe G d de                       ZYe G d  d!e                       ZZe G d" d#e                       Z[e G d$ d%e                       Z\eeVeSf         Z]eeXeWf         Z^eeZeYf         Z_ee\e[f         Z`eeTeUf         Zaee]e^e_e`eaf         Zb G d& d'e          Zc G d( d)          Zdd* Zed<d,Zfd d- egd.           d
fd/ejh        d0eid1egd2egd3eid4ejh        fd5Zjd6ejh        d7ejh        d8ejh        d9egd:eid4ejh        fd;ZkdS )=    N)	dataclass)TYPE_CHECKINGAnyCallableDictListOptionalTupleUnion)nn   )is_deepspeed_zero3_enabled)CausalLMOutputWithPastSeq2SeqLMOutput)'MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPINGMODEL_FOR_CAUSAL_LM_MAPPING&MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPINGMODEL_FOR_VISION_2_SEQ_MAPPING)ExplicitEnumModelOutputlogging   )DisjunctiveConstraintPhrasalConstraint)
BeamScorerBeamSearchScorerConstrainedBeamSearchScorer)GenerationConfig)#EncoderNoRepeatNGramLogitsProcessor'EncoderRepetitionPenaltyLogitsProcessorEpsilonLogitsWarperEtaLogitsWarperExponentialDecayLengthPenaltyForcedBOSTokenLogitsProcessorForcedEOSTokenLogitsProcessorForceTokensLogitsProcessorHammingDiversityLogitsProcessorInfNanRemoveLogitsProcessorLogitNormalizationLogitsProcessorListMinLengthLogitsProcessor!MinNewTokensLengthLogitsProcessorNoBadWordsLogitsProcessorNoRepeatNGramLogitsProcessor PrefixConstrainedLogitsProcessor RepetitionPenaltyLogitsProcessorSequenceBiasLogitsProcessor$SuppressTokensAtBeginLogitsProcessorSuppressTokensLogitsProcessorTemperatureLogitsWarperTopKLogitsWarperTopPLogitsWarperTypicalLogitsWarper.UnbatchedClassifierFreeGuidanceLogitsProcessor)MaxLengthCriteriaMaxTimeCriteriaStoppingCriteriaStoppingCriteriaListvalidate_stopping_criteria)PreTrainedModel)BaseStreamerc                       e Zd ZU dZdZej        ed<   dZe	e
ej                          ed<   dZe	e
e
ej                                   ed<   dZe	e
e
ej                                   ed<   dS )GreedySearchDecoderOnlyOutputa-  
    Base class for outputs of decoder-only generation models using greedy search.


    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
    N	sequencesscores
attentionshidden_states__name__
__module____qualname____doc__rC   torch
LongTensor__annotations__rD   r	   r
   FloatTensorrE   rF        =lib/python3.11/site-packages/transformers/generation/utils.pyrB   rB   T   s          ( #'Iu&&&15FHU5,-.555<@JuU%6789@@@?CM8E%(9":;<CCCCCrQ   rB   c                   p   e Zd ZU dZdZej        ed<   dZe	e
ej                          ed<   dZe	e
ej                          ed<   dZe	e
ej                          ed<   dZe	e
e
ej                                   ed<   dZe	e
e
ej                                   ed<   dZe	e
e
ej                                   ed	<   dS )
%ContrastiveSearchEncoderDecoderOutputaP
  
    Base class for outputs of decoder-only generation models using contrastive search.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
            sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
    NrC   rD   encoder_attentionsencoder_hidden_statesdecoder_attentionscross_attentionsdecoder_hidden_statesrH   rI   rJ   rK   rC   rL   rM   rN   rD   r	   r
   rO   rU   rV   rW   rX   rY   rP   rQ   rR   rT   rT   p   s          8 #'Iu&&&15FHU5,-.555=Au'8!9:AAA@D8E%*;$<=DDDDHuU->'?!@AHHHBFhuU5+<%=>?FFFGK8E%0A*B$CDKKKKKrQ   rT   c                       e Zd ZU dZdZej        ed<   dZe	e
ej                          ed<   dZe	e
e
ej                                   ed<   dZe	e
e
ej                                   ed<   dS )"ContrastiveSearchDecoderOnlyOutputaA  
    Base class for outputs of decoder-only generation models using contrastive search.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when
        `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is
        passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
    NrC   rD   rE   rF   rG   rP   rQ   rR   r\   r\                * #'Iu&&&15FHU5,-.555<@JuU%6789@@@?CM8E%(9":;<CCCCCrQ   r\   c                   p   e Zd ZU dZdZej        ed<   dZe	e
ej                          ed<   dZe	e
ej                          ed<   dZe	e
ej                          ed<   dZe	e
e
ej                                   ed<   dZe	e
e
ej                                   ed<   dZe	e
e
ej                                   ed	<   dS )
 GreedySearchEncoderDecoderOutputaH  
    Base class for outputs of encoder-decoder generation models using greedy search. Hidden states and attention
    weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the
    encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)


    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
            sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
    NrC   rD   rU   rV   rW   rX   rY   rZ   rP   rQ   rR   r_   r_      s          > #'Iu&&&15FHU5,-.555=Au'8!9:AAA@D8E%*;$<=DDDDHuU->'?!@AHHHBFhuU5+<%=>?FFFGK8E%0A*B$CDKKKKKrQ   r_   c                       e Zd ZU dZdZej        ed<   dZe	e
ej                          ed<   dZe	e
e
ej                                   ed<   dZe	e
e
ej                                   ed<   dS )SampleDecoderOnlyOutputa  
    Base class for outputs of decoder-only generation models using sampling.


    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(num_return_sequences*batch_size, num_heads, generated_length,
            sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(num_return_sequences*batch_size, generated_length, hidden_size)`.
    NrC   rD   rE   rF   rG   rP   rQ   rR   ra   ra      r]   rQ   ra   c                   p   e Zd ZU dZdZej        ed<   dZe	e
ej                          ed<   dZe	e
ej                          ed<   dZe	e
ej                          ed<   dZe	e
e
ej                                   ed<   dZe	e
e
ej                                   ed<   dZe	e
e
ej                                   ed	<   dS )
SampleEncoderDecoderOutputa  
    Base class for outputs of encoder-decoder generation models using sampling. Hidden states and attention weights of
    the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states
    attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)


    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape
            `(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size*num_return_sequences, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_return_sequences, num_heads, generated_length,
            sequence_length)`.
        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_return_sequences, generated_length, hidden_size)`.
    NrC   rD   rU   rV   rW   rX   rY   rZ   rP   rQ   rR   rc   rc      s          @ #'Iu&&&15FHU5,-.555=Au'8!9:AAA@D8E%*;$<=DDDDHuU->'?!@AHHHBFhuU5+<%=>?FFFGK8E%0A*B$CDKKKKKrQ   rc   c                      e Zd ZU dZdZej        ed<   dZe	ej
                 ed<   dZe	eej
                          ed<   dZe	ej                 ed<   dZe	eeej
                                   ed<   dZe	eeej
                                   ed<   dS )	BeamSearchDecoderOnlyOutputa	  
    Base class for outputs of decoder-only generation models using beam search.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
    NrC   sequences_scoresrD   beam_indicesrE   rF   rH   rI   rJ   rK   rC   rL   rM   rN   rf   r	   rO   rD   r
   rg   rE   rF   rP   rQ   rR   re   re   &            2 #'Iu&&&48hu0188815FHU5,-.555/3L(5+,333<@JuU%6789@@@?CM8E%(9":;<CCCCCrQ   re   c                      e Zd ZU dZdZej        ed<   dZe	ej
                 ed<   dZe	eej
                          ed<   dZe	ej                 ed<   dZe	eej
                          ed<   dZe	eej
                          ed<   dZe	eeej
                                   ed	<   dZe	eeej
                                   ed
<   dZe	eeej
                                   ed<   dS )BeamSearchEncoderDecoderOutputaM  
    Base class for outputs of encoder-decoder generation models using beam search. Hidden states and attention weights
    of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states
    attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
            sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length,
            sequence_length)`.
        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
    NrC   rf   rD   rg   rU   rV   rW   rX   rY   rH   rI   rJ   rK   rC   rL   rM   rN   rf   r	   rO   rD   r
   rg   rU   rV   rW   rX   rY   rP   rQ   rR   rk   rk   I  s%        # #J #'Iu&&&48hu0188815FHU5,-.555/3L(5+,333=Au'8!9:AAA@D8E%*;$<=DDDDHuU->'?!@AHHHBFhuU5+<%=>?FFFGK8E%0A*B$CDKKKKKrQ   rk   c                      e Zd ZU dZdZej        ed<   dZe	ej
                 ed<   dZe	eej
                          ed<   dZe	ej                 ed<   dZe	eeej
                                   ed<   dZe	eeej
                                   ed<   dS )	BeamSampleDecoderOnlyOutputa  
    Base class for outputs of decoder-only generation models using beam sample.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`torch.FloatTensor` of shape `(batch_size * num_return_sequence)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
    NrC   rf   rD   rg   rE   rF   rh   rP   rQ   rR   rn   rn   {  ri   rQ   rn   c                      e Zd ZU dZdZej        ed<   dZe	ej
                 ed<   dZe	eej
                          ed<   dZe	ej                 ed<   dZe	eej
                          ed<   dZe	eej
                          ed<   dZe	eeej
                                   ed	<   dZe	eeej
                                   ed
<   dZe	eeej
                                   ed<   dS )BeamSampleEncoderDecoderOutputa  
    Base class for outputs of encoder-decoder generation models using beam sampling. Hidden states and attention
    weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the
    encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`torch.FloatTensor` of shape `(batch_size * num_return_sequence)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`).
        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
            sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size*num_beams, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
    NrC   rf   rD   rg   rU   rV   rW   rX   rY   rl   rP   rQ   rR   rp   rp     s%        " "H #'Iu&&&48hu0188815FHU5,-.555/3L(5+,333=Au'8!9:AAA@D8E%*;$<=DDDDHuU->'?!@AHHHBFhuU5+<%=>?FFFGK8E%0A*B$CDKKKKKrQ   rp   c                   2    e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
S )GenerationModezg
    Possible generation modes, downstream of the [`~generation.GenerationMixin.generate`] method.
    contrastive_searchgreedy_searchsampleassisted_generationbeam_searchbeam_sampleconstrained_beam_searchgroup_beam_searchN)rH   rI   rJ   rK   CONTRASTIVE_SEARCHGREEDY_SEARCHSAMPLEASSISTED_GENERATIONBEAM_SEARCHBEAM_SAMPLECONSTRAINED_BEAM_SEARCHGROUP_BEAM_SEARCHrP   rQ   rR   rr   rr     sH         
 .#MF/KK7+rQ   rr   c            $          e Zd ZdZd Z	 	 	 dRdeej                 dee         dee	e
ej        f                  deej        ee
         e	e
ej        f         f         fdZ	 	 	 dRdeej                 dee         dee	e
ej        f                  dej        fd	Zdej        d
ee         deeeee         f                  dej        fdZ	 dSdej        dee
         de	e
ef         fdZ	 	 	 dRdede
de	e
ej        f         dededej        deej        e	e
ej        f         f         fdZdTdededefdZe	 	 	 dUdededeej                 deej        e	e
ef         f         fd            ZdVdedefdZ	 	 dWdede	e
ef         dedede	e
ef         f
dZd Zd edefd!Z d ed"ed#         de!fd$Z"	 	 	 dRd ed%ed&ej        d'e#eej        gee         f         d(ee         dee	e
ef                  d)eej                 d*eej                 defd+Z$d ed,ee%         de%fd-Z&d.eee%f         d/eee%f         deee%f         fd0Z'	 	 dXd1ej        d2eej                 d3eej                 d4edej        f
d5Z(d6 Z)de	e
ef         fd7Z*d8 Z+ ej,                    	 	 	 	 	 	 	 	 	 	 dYdeej                 d ee         d(ee         d,ee%         d'ee#eej        gee         f                  d9ee         d"ed#         d:ed;         d)eej                 d*eej                 dee-ej        f         fd<            Z. ej,                    	 	 	 	 	 	 	 	 	 	 	 	 	 	 dZdej        d>ee         d?ee/         d(ee         d@ee         d,ee%         d
ee         deeeee         f                  dAee         dBee         dCee         dDee         d9ed:ed;         dEee         dee0ej        f         f dF            Z1	 	 	 	 	 	 	 	 	 	 	 d[dej        d(ee         d,ee%         dGee         d
ee         deeeee         f                  dAee         dBee         dCee         dDee         d9ed:ed;         dee2ej        f         fdHZ3	 	 	 	 	 	 	 	 	 	 	 	 d\dej        d(ee         d,ee%         d@ee         dGee         d
ee         deeeee         f                  dAee         dBee         dCee         dDee         d9ed:ed;         dee4ej        f         fdIZ5	 	 	 	 	 	 	 	 	 	 d]dej        dJe6d(ee         d,ee%         dGee         d
ee         deeeee         f                  dAee         dBee         dCee         dDee         d9edee7ej        f         fdKZ8	 	 	 	 	 	 	 	 	 	 	 d^dej        dJe6d(ee         d,ee%         d@ee         dGee         d
ee         deeeee         f                  dAee         dBee         dCee         dDee         d9edee9ej        f         fdLZ:	 	 	 	 	 	 	 	 	 	 d]dej        dJe6d(ee         d,ee%         dGee         d
ee         deeeee         f                  dAee         dBee         dCee         dDee         d9efdMZ;	 	 	 	 	 	 	 	 	 	 dYdej        dNe<d(ee         d,ee%         dGee         d
ee         deeeee         f                  dAee         dBee         dCee         dDee         d9ee         dee7ej        f         fdOZ=	 	 	 	 	 	 	 	 	 	 	 	 d_dej        d"d#dPed(ee         d@ee         d,ee%         d
ee         deeeee         f                  dAee         dBee         dCee         dDee         d9ed:ed;         fdQZ>dS )`GenerationMixina  
    A class containing all functions for auto-regressive text generation, to be used as a mixin in [`PreTrainedModel`].

    The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
        - *greedy decoding* by calling [`~generation.GenerationMixin.greedy_search`] if `num_beams=1` and
          `do_sample=False`
        - *contrastive search* by calling [`~generation.GenerationMixin.contrastive_search`] if `penalty_alpha>0` and
          `top_k>1`
        - *multinomial sampling* by calling [`~generation.GenerationMixin.sample`] if `num_beams=1` and
          `do_sample=True`
        - *beam-search decoding* by calling [`~generation.GenerationMixin.beam_search`] if `num_beams>1` and
          `do_sample=False`
        - *beam-search multinomial sampling* by calling [`~generation.GenerationMixin.beam_sample`] if `num_beams>1`
          and `do_sample=True`
        - *diverse beam-search decoding* by calling [`~generation.GenerationMixin.group_beam_search`], if `num_beams>1`
          and `num_beam_groups>1`
        - *constrained beam-search decoding* by calling [`~generation.GenerationMixin.constrained_beam_search`], if
          `constraints!=None` or `force_words_ids!=None`

    You do not need to call any of the above methods directly. Pass custom parameter values to 'generate' instead. To
    learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
    c                      t          d          )NzeA model class needs to define a `prepare_inputs_for_generation` method in order to use `.generate()`.)NotImplementedError)selfargskwargss      rR   prepare_inputs_for_generationz-GenerationMixin.prepare_inputs_for_generation   s    !s
 
 	
rQ   Ninputsbos_token_idmodel_kwargsreturnc                    | j         j        r2t          | d          r"| j        j        | j        k    r| j        j        n| j        fd|                                D             }|                    d          }||t          d| d d| d d	          ||}d	k    rd
|v r| j         j        szd
t          t          j
        | j                  j                                                  v }|st          d| j        j         d          |                     |||          |d	<   n|t          d          |d
         d
c}|                     |||          }||fS )zT
        This function extracts the model-specific `inputs` for generation.
        encoderc                 ,    i | ]\  }}||k    ||S NrP   ).0kv
input_names      rR   
<dictcomp>z9GenerationMixin._prepare_model_inputs.<locals>.<dictcomp>  s/    bbbARSWaRaRa1RaRaRarQ   Nz
`inputs`: z` were passed alongside z/ which is not allowed.Make sure to either pass z or z=...	input_idsinputs_embedszAYou passed `inputs_embeds` to `.generate()`, but the model class z doesn't have its forwarding implemented. See the GPT2 implementation for an example (https://github.com/huggingface/transformers/pull/21405), and feel free to open a PR with it!)r   zMYou passed `inputs_embeds` and `input_ids` to `.generate()`. Please pick one.)configis_encoder_decoderhasattrr   main_input_nameitemspop
ValueErrorsetinspect	signaturer   
parameterskeys	__class__rH   *_maybe_initialize_input_ids_for_generation)r   r   r   r   inputs_kwarghas_inputs_embeds_forwardingr   s         @rR   _prepare_model_inputsz%GenerationMixin._prepare_model_inputs  s    K*	.i((	. ,0DDD5JJ-Jbbbb););)=)=bbb $''
D99#(:IV I IZ I I,2I I8BI I I   %!F $$L)H)H;1 v/>#%d&HIITYY[[C C 0, 4 $x\`\j\s x x x   -1,[,[L| -\ - -[)) %$%tuuu!-o!>FJ @@Wcddz<//rQ   c                    ||S |                     d          }| j        j        rL|J|j                                        dd         }t          j        |t
          j        | j                  dz  S |t          d          d}|
                                D ]+}t          |t
          j                  r|j        d         } n,t          j        |dft
          j        | j                  |z  S )	z3Initializes input ids for generation, if necessary.Nencoder_outputsdtypedeviceizB`bos_token_id` has to be defined when no `input_ids` are provided.r   r   )getr   r   last_hidden_statesizerL   oneslongr   r   values
isinstanceTensorshape)r   r   r   r   r   r   
batch_sizevalues           rR   r   z:GenerationMixin._maybe_initialize_input_ids_for_generationE  s     M&**+<==;) 	Ro.I#5::<<SbSAE:e5:dkJJJTQQabbb 
!((** 	 	E%.. "[^
 z:q/DKPPPS___rQ   pad_token_ideos_token_idc                    t          |j                  dk    o|j        t          j        t          j        fv }|d uo||v }t          |t                    r|g}|d u p||v}|r+|r)|r'|                    |                                          S t          j        |j        d d         t          j        |j	                  S )Nr   r   )
lenr   r   rL   intr   r   ner   r   )r   r   r   r   is_input_idsis_pad_token_in_inputs&is_pad_token_not_equal_to_eos_token_ids          rR   &_prepare_attention_mask_for_generationz6GenerationMixin._prepare_attention_mask_for_generationa  s     6<((A-Y&,59ejBY2Y".d":!XQWAWlC(( 	*(>L2>$2F1mL`lLl.  	X2 	X7] 	X99\**//111:fl2A2.ejWWWWrQ   inputs_tensormodel_input_namec                    |                                  }t          |d          rd|j        _        g dfd|                                D             }t          t          j        |j                  j	                  dv pdv }|s fd|                                D             }||n| j
        }d|d<   |||<    |d
i ||d	<   |S )N_hf_hookT)decoder_
cross_attn	use_cachec                 T    i | ]#\  }t          fd D                        |$S )c              3   B   K   | ]}                     |          V  d S r   )
startswith)r   parguments     rR   	<genexpr>z\GenerationMixin._prepare_encoder_decoder_kwargs_for_generation.<locals>.<dictcomp>.<genexpr>  s1      II!x**1--IIIIIIrQ   any)r   r   r   irrelevant_prefixs     @rR   r   zRGenerationMixin._prepare_encoder_decoder_kwargs_for_generation.<locals>.<dictcomp>  sT     
 
 
%IIII7HIIIII
e
 
 
rQ   r   r   c                 $    i | ]\  }}|v 	||S rP   rP   )r   r   r   encoder_signatures      rR   r   zRGenerationMixin._prepare_encoder_decoder_kwargs_for_generation.<locals>.<dictcomp>  s2       $3HeQY]nQnQn%QnQnQnrQ   return_dictr   rP   )get_encoderr   r   io_same_devicer   r   r   r   forwardr   r   )	r   r   r   r   r   encoder_kwargsencoder_accepts_wildcardr   r   s	          @@rR   ._prepare_encoder_decoder_kwargs_for_generationz>GenerationMixin._prepare_encoder_decoder_kwargs_for_generations  s:    ""$$ 7J'' 	3.2G+ DCC
 
 
 
#/#5#5#7#7
 
 

   1'/ B B MNN#+/@#@#gNVgDg ' 	   7E7K7K7M7M  N
 0@/K++QUQe(,}%+8'(7>w7P7P7P7P&'rQ   r   decoder_start_token_idr   c                    |d|v r|                     d          }n"d|v r|dk    r|                     d          }nd}|                     ||          }|| j        }t          j        |dft          j        |          |z  }||}n| j        j        dk    rd| j        	                                v rn|dddf         |k    
                                                                r_t          j        ||gd	
          }d|v rC|d         }	t          j        t          j        |	          ddddf         |	fd	
          }	|	|d<   ||fS )zGPrepares `decoder_input_ids` for generation with encoder-decoder modelsNdecoder_input_idsr   r   r   zvision-encoder-decoderdonutr   r   dimdecoder_attention_mask)r   _get_decoder_start_token_idr   rL   r   r   r   
model_typename_or_pathlowerallitemcat	ones_like)
r   r   r   r   r   r   r   r   decoder_input_ids_startr   s
             rR   )_prepare_decoder_input_ids_for_generationz9GenerationMixin._prepare_decoder_input_ids_for_generation  s    #(;|(K(K , 0 01D E EL((-=-L-L , 0 0 = = $ "&!A!ABXZf!g!g>[F"'*j!_EJW]"^"^"^aw"w $ 7[#'???GtO`OfOfOhOhDhDh  1%)??DDFFKKMM 	P %	+BDU*V\^ _ _ _'<77)56N)O&)._%;<<QQQUCE[\* * *& :P56 ,..rQ   c                 p    ||n| j         j        }||n| j         j        }||S ||S t          d          )Nz\`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation.)generation_configr   r   r   )r   r   r   s      rR   r   z+GenerationMixin._get_decoder_start_token_id  sc     &1 #"'> 	
 (4'?||TE[Eh!-))%j
 
 	
rQ   r   Fexpand_sizer   r   c                       fd}||                      d          } ||          }|r8|                    d          t          d           ||d                   |d<   ||fS )zIExpands tensors from [batch_size, ...] to [batch_size * expand_size, ...]c                     | D ]J}| |         @t          | |         t          j                  r | |                             d          | |<   K| S )Nr   r   )r   rL   r   repeat_interleave)dict_to_expandkeyr   s     rR   _expand_dict_for_generationzRGenerationMixin._expand_inputs_for_generation.<locals>._expand_dict_for_generation  sc    % d d!#&2z.QTBUW\Wc7d7d2*8*=*O*OP[ab*O*c*cN3'!!rQ   Nr   r   r   zMIf `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.)r   r   r   )r   r   r   r   r   s   `    rR   _expand_inputs_for_generationz-GenerationMixin._expand_inputs_for_generation  s    	" 	" 	" 	" 	"  !33KQ3GGI22<@@ 	k 122: !pqqq.I.I,WhJi.j.jL*+,&&rQ   outputsstandardize_cache_formatc                     d }d|v r|j         }nd|v r|j        }nd|v r|j        }|r9t          | d          r)|j        j        d         }|                     ||          }|S )Npast_key_valuesmemspast_buckets_states_convert_to_standard_cacher   )r   )r   r   r   r   logitsr   r   )r   r   r   r   r   s        rR   _extract_past_from_model_outputz/GenerationMixin._extract_past_from_model_output  s    ''%5OOw%lOO"g--%9O $ 	f6R(S(S 	f -a0J"==oZd=eeOrQ   c                    |                      ||          |d<   t          |dd           
|j        |d<   d|v r@|d         }t          j        ||d d df                             d          gd          |d<   |sHd|v rC|d         }t          j        ||                    |j        d         d	f          gd          |d<   nGd
|v rC|d
         }t          j        ||                    |j        d         d	f          gd          |d
<   |S )Nr   r   statetoken_type_idsr   r   attention_maskr   r   r   )r  getattrr  rL   r   	unsqueezenew_onesr   )r   r   r   r   r   r  r  r   s           rR   #_update_model_kwargs_for_generationz3GenerationMixin._update_model_kwargs_for_generation  sn    +/*N*N.F +O +
 +
&' 7GT**6$+ML! |++)*:;N-2YWXWXWXZ\W\H]HgHghjHkHk7lrt-u-u-uL)*! 	<//!-.>!?16#^%<%<n>RST>UWX=Y%Z%Z[ac2 2 2-.
 (<77)56N)O&9>+-C-L-LNdNjklNmopMq-r-rs: : :56
 rQ   c                 J    t          d| j        j         d| j                   )NzGMake sure that a `_reorder_cache` function is correctly implemented in z to enable beam search for )r   r   rI   )r   r   beam_idxs      rR   _reorder_cachezGenerationMixin._reorder_cache  s;    !8VZVdVo 8 8'+~8 8
 
 	
rQ   r   c                 |   t                      }|j        2|j        dk    r'|                    t          |j                             |j        dk    rdnd}|j        4|j        dk    r)|                    t          |j        |                     |j        4|j        dk     r)|                    t          |j        |                     |j	        4|j	        dk     r)|                    t          |j	        |                     |j        >d	|j        cxk     rdk     r,n n)|                    t          |j        |
                     |j        >d	|j        cxk     rdk     r,n n)|                    t          |j        |
                     |j        du r!|                    t!                                 |S )z
        This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsWarper`] instances
        used for multinomial sampling.
        N      ?r   r   r   )top_kmin_tokens_to_keep)top_pr  )massr          )epsilonr  T)r+   temperatureappendr5   	num_beamsr  r6   r  r7   	typical_pr8   epsilon_cutoffr"   
eta_cutoffr#   renormalize_logitsr*   )r   r   warpersr  s       rR   _get_logits_warperz"GenerationMixin._get_logits_warper%  s     &'' (49J9VZ]9]9]NN23D3PQQRRR"3"="A"AQQq".3D3Ja3O3ONN+2C2I^pqqqrrr".3D3JS3P3PNN+2C2I^pqqqrrr&27H7RUX7X7XNN#):)DYklll   +7CBSBb<h<h<h<heh<h<h<h<h<hNN#,=,Lasttt   '3>O>Z8`8`8`8`]`8`8`8`8`8`NN(9(DYklll   /477NN-//000rQ   assistant_modelr?   c                    |j         |j        t          j        }n|j        dk    rT|j        du r>|j        *|j        dk    r|j        |j        dk    rt          j        }nTt          j	        }nGt          j
        }n:|j        dk    rt          j        }n"|j        du rt          j        }nt          j        }| |dv rt          j        }nt!          d          |S )z[
        Returns the generation mode triggered by a [`GenerationConfig`] instance.
        Nr   Fr   T)rt   ru   zYou've set `assistant_model`, which triggers assisted generate. Currently, assisted generate is only supported with Greedy Search and Sample.)constraintsforce_words_idsrr   r   r  	do_sampler  penalty_alphar{   r|   r}   num_beam_groupsr   r   r   r~   r   )r   r   r   generation_modes       rR   _get_generation_modez$GenerationMixin._get_generation_modeK  s     (48I8Y8e,DOO(A-- *e33%+7)/!33)7C)7!;;&4&GOO&4&BOO"0"7 0144"0"B",44"0"<"0"< &"==="0"D G   rQ   input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnlogits_processornegative_prompt_idsnegative_prompt_attention_maskc	           
       	   t                      }	|j        =|j        dk    r2|	                    t          |j        | |||d                              |j        (|	                    t          |j                             |j        ?|j        dk    r4|	                    t          |j        |j        |j	                             |j
        4|j
        dk    r)|	                    t          |j
        |	                     |j        3|j        dk    r(|	                    t          |j        
                     |j        2|j        dk    r'|	                    t          |j                             |j        O|j        dk    rD| j        j        r)|	                    t'          |j        |                     nt)          d          |j        -|	                    t-          |j        |j                             |j        ?|j        8|j        dk    r-|	                    t3          |j        |j                             |j        @|j        9|j        dk    r.|	                    t7          ||j        |j                             |0|	                    t9          ||j        |j	        z                       |j        '|	                    t=          |j                             |j        -|	                    tA          |j!        |j                             |j"        du r!|	                    tG                                 |j$        .|	                    tK          |j$        |j        |                     |j&        '|	                    tO          |j&                             |j(        [|}
|dk    s|j        |
n|
dz   }
|j)        |
|j)        d         d         z  }
|	                    tU          |j(        |
                     |j)        '|	                    tW          |j)                             | ,                    |	|          }	|j-        du r!|	                    t]                                 |	S )z
        This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`]
        instances used to modify the scores of the language model head.
        Nr   r   )unconditional_idsunconditional_attention_maskr   )sequence_biasr  )diversity_penaltyr  r&  r  )penaltyr*  )r4  r   zTIt's impossible to use `encoder_no_repeat_ngram_size` with decoder-only architectureTr   )/r+   guidance_scaler  r9   r2  r2   r3  r(   r  r&  encoder_repetition_penaltyr!   repetition_penaltyr1   no_repeat_ngram_sizer/   encoder_no_repeat_ngram_sizer   r   r    r   bad_words_idsr.   r   
min_lengthr,   min_new_tokensr-   r0   forced_bos_token_idr%   forced_eos_token_idr&   
max_lengthremove_invalid_valuesr)    exponential_decay_length_penaltyr$   suppress_tokensr4   begin_suppress_tokensforced_decoder_idsr3   r'   _merge_criteria_processor_listr  r*   )r   r   r)  r*  r+  r,  r   r-  r.  
processorsbegin_indexs              rR   _get_logits_processorz%GenerationMixin._get_logits_processors  s     )**
+7<M<\`a<a<a>%4&91O*;7     *69HYHghhhiii.:?P?beh?h?h/&7&I/9$5$E     8D!<CC7-H\m    
 /;@Q@dhk@k@k>GXGklllmmm1=BSBhklBlBl:;L;abbccc:F!>BB{- 	!!7)FHY     !j   *6)*;*IK\Kijj   (4!.:!,q0067H7SUfUsttuuu,8!.:!01441(*;*JL]Lj   
 $/0,.?.IM^Mn.n   
 0<;<M<abbccc0<-.?.JL]Lqrr   2d::9;;<<<=I-%F%2(    ,8;<M<]^^___2>.K )1,,0A0U0]  1_ 
 !3?0CBGJJ45F5\^ijj   /;89J9]^^___88EUVV
/477022333rQ   stopping_criteriac                 8   t                      }|j        ?t          | j        dd           }|                    t          |j        |                     |j        (|                    t          |j                             |                     ||          }|S )Nmax_position_embeddings)r?  rK  )max_time)	r=   r?  r  r   r  r:   rL  r;   rE  )r   r   rI  criteriarK  s        rR   _get_stopping_criteriaz&GenerationMixin._get_stopping_criteria  s     ())'3&-dk;TVZ&[&[#OO!0;,C     %1OOO5F5OPPPQQQ66xARSSrQ   default_listcustom_listc                 @   t          |          dk    r|S |D ]p}|D ]k}t          |          t          |          u rKt          |t                    rdnd}t	          d| dt          |           d| d| d| d	| d
| d          lq|                    |           |S )Nr   zstopping criteriazlogits processorz	A custom z	 of type z with values zS has been passed to `.generate()`, but it has already been created with the values z. z has been created by passing the corresponding arguments to generate or by the model's config default values. If you just want to change the default values of zO consider passing them as arguments to `.generate()` instead of using a custom .)r   typer   r<   r   extend)r   rO  rP  defaultcustomobject_types         rR   rE  z.GenerationMixin._merge_criteria_processor_list  s'   
 {q  # 
	 
	G% 	 	<<4==009CFL\9]9]"u"5"5cuK$hK h h$v,, h hU[ h h[bh hfmh h Vah h Ze	h h h   1	 	K(((rQ   rC   rD   rg   normalize_logitsc                    |tt          j        |d         j        d                                       dd                              |j                  }|                    dt          |                    }t          j        |          	                    t          |          d          
                    dd          }|rs|	                    d| j        j        |j        d                   }t           j        j                            |d          }|	                    d|j        d                   }|dk     }d|                                z
                      d                                          }|                                ddd|f         }|ddd|f         }d||<   || j        j        z  }|j        d         |z
  }|dd|df         |z   }	|                    d|	          }
d|
|<   |
S )a"  
        Computes the transition scores of sequences given the generation scores (and beam indices, if beam search was
        used). This is a convenient method to quicky obtain the scores of the selected tokens at generation time.

        Parameters:
            sequences (`torch.LongTensor`):
                The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
                shorter if all batches finished early due to the `eos_token_id`.
            scores (`tuple(torch.FloatTensor)`):
                Transition scores for each vocabulary token at each generation step. Beam transition scores consisting
                of log probabilities of tokens conditioned on log softmax of previously generated tokens Tuple of
                `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token), with
                each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
            beam_indices (`torch.LongTensor`, *optional*):
                Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
                `(batch_size*num_return_sequences, sequence_length)`. Only required if a `num_beams>1` at
                generate-time.
            normalize_logits (`bool`, *optional*, defaults to `False`):
                Whether to normalize the logits (which, for legacy reasons, may be unnormalized).

        Return:
            `torch.Tensor`: A `torch.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)` containing
                the transition scores (logits)

        Examples:

        ```python
        >>> from transformers import GPT2Tokenizer, AutoModelForCausalLM
        >>> import numpy as np

        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
        >>> tokenizer.pad_token_id = tokenizer.eos_token_id
        >>> inputs = tokenizer(["Today is"], return_tensors="pt")

        >>> # Example 1: Print the scores for each token generated with Greedy Search
        >>> outputs = model.generate(**inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True)
        >>> transition_scores = model.compute_transition_scores(
        ...     outputs.sequences, outputs.scores, normalize_logits=True
        ... )
        >>> # input_length is the length of the input prompt for decoder-only models, like the GPT family, and 1 for
        >>> # encoder-decoder models, like BART or T5.
        >>> input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
        >>> generated_tokens = outputs.sequences[:, input_length:]
        >>> for tok, score in zip(generated_tokens[0], transition_scores[0]):
        ...     # | token | token string | logits | probability
        ...     print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")
        |   262 |  the     | -1.414 | 24.33%
        |  1110 |  day     | -2.609 | 7.36%
        |   618 |  when    | -2.010 | 13.40%
        |   356 |  we      | -1.859 | 15.58%
        |   460 |  can     | -2.508 | 8.14%

        >>> # Example 2: Reconstruct the sequence scores from Beam Search
        >>> outputs = model.generate(
        ...     **inputs,
        ...     max_new_tokens=5,
        ...     num_beams=4,
        ...     num_return_sequences=4,
        ...     return_dict_in_generate=True,
        ...     output_scores=True,
        ... )
        >>> transition_scores = model.compute_transition_scores(
        ...     outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
        ... )
        >>> # If you sum the generated tokens' scores and apply the length penalty, you'll get the sequence scores.
        >>> # Tip: recomputing the scores is only guaranteed to match with `normalize_logits=False`. Depending on the
        >>> # use case, you might want to recompute it with `normalize_logits=True`.
        >>> output_length = input_length + np.sum(transition_scores.numpy() < 0, axis=1)
        >>> length_penalty = model.generation_config.length_penalty
        >>> reconstructed_scores = transition_scores.sum(axis=1) / (output_length**length_penalty)
        >>> print(np.allclose(outputs.sequences_scores, reconstructed_scores))
        True
        ```Nr   r   r   r   )rL   aranger   viewtor   expandr   stackreshape	transposer   
vocab_sizer   
functionallog_softmaxr   summaxclonegather)r   rC   rD   rg   rX  beam_indices_maskmax_beam_lengthbeam_sequence_indicescut_idxindicestransition_scoress              rR   compute_transition_scoresz)GenerationMixin.compute_transition_scores  s   f  <q	(:;;@@QGGJJ9K[\\L'..r3v;;??L V$$,,S[["==GG1MM  	:^^B(>R@PQQFX(44V4CCF^^BR(899F )1,055777<<R@@DDFF#))++AAA/?/?,?@-aaa1A/1A.AB +,&' !-t{/E E /"%7AAAwxxK(+@@ #MM!W55 01+,  rQ   c                 p   |                                  st          t          t          t          t
          g}t                      }|D ]G}|                    t          | j	                  d          }||
                    |j                   Hd| j        j         d}|r|d| z  }t          |          dS )z
        Confirms that the model class is compatible with generation. If not, raises an exception that points to the
        right class to use.
        N)rU  zThe current model class (zQ) is not compatible with `.generate()`, as it doesn't have a language model head.z2 Please use one of the following classes instead: )can_generater   r   r   r   r   r   r   rS  r   addrH   r   	TypeError)r   generate_compatible_mappingsgenerate_compatible_classesmodel_mappingsupported_modelsexception_messages         rR   _validate_model_classz%GenerationMixin._validate_model_class  s    
   "" 	/+7.62,( +.%%'!= O O#0#4#4T$+5F5FPT#4#U#U #//334D4MNNN9DN,C 9 9 9  + x!%wZu%w%ww!-...'	/ 	/rQ   c                 f   | j         j        rdD ]}|                    |d           g }t          t	          j        | j                  j                  }d|v sd|v r.|t          t	          j        | j                  j                  z  }| j         j        rt          | | j
        d          }t          | dd          }||t          |dd          }|0t          t	          j        |j                  j                  }||z  }t          | dd          }||t          |dd          }|:t          t	          j        |j                  j                  }	|d |	D             z  }|                                D ] \  }}
|
||vr|                    |           !|rt          d| d	          dS )
zXValidates model kwargs for generation. Generate argument typos will also be caught here.)r   Nr   r   r   decoderc                     h | ]}d | S )r   rP   )r   xs     rR   	<setcomp>z9GenerationMixin._validate_model_kwargs.<locals>.<setcomp>  s    JJJ!~!~~JJJrQ   z8The following `model_kwargs` are not used by the model: zG (note: typos in the generate arguments will also show up in this list))r   r   r   r   r   r   r   r   r   r  base_model_prefixr   r  r   )r   r   r   unused_model_args
model_args
base_modelr   encoder_model_argsrz  decoder_model_argsr   s              rR   _validate_model_kwargsz&GenerationMixin._validate_model_kwargs  s    ;) 	,, , ,  d++++*4+MNNYZZ
 z!!^z%A%A#g/==HIIIJ ;) 	K t'=tDDJ dIt44G :#9!*i>>"%():7?)K)K)V%W%W"00
 dIt44G:#9!*i>>"%():7?)K)K)V%W%W"JJ7IJJJJ
&,,.. 	. 	.JC S
%:%:!((--- 	FK\ F F F  	 	rQ   c           	      L   |r5|j         .|j        dk    r#t          j        d|j         dt                     ||j        k    r9| j        j        rdnd}t          j        d| d| d	|j         d
t                     d}|r|d|j         dz  }|j        >|j        |j        k    r.t          j        d|j         d|j         d|z   t                     |j        H|j        |z   }||j        k    r5t          j        d|j         d| d|j         d|z   t                     dS dS dS )z=Performs validation related to the resulting generated lengthN   z0Using the model-agnostic default `max_length` (=zy) to control thegeneration length. We recommend setting `max_new_tokens` to control the maximum length of the generation.r   r   zInput length of z is z, but `max_length` is set to zX. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.z Generation will stop at the defined maximum length. You should decrease the minimum length and/or increase the maximum length.z" Note that `max_length` is set to z, its default value.z-Unfeasible length constraints: `min_length` (z.) is larger than the maximum possible length (z).z1Unfeasible length constraints: `min_new_tokens` (z$), when added to the prompt length (z/), is larger than the maximum possible length ()	max_new_tokensr?  warningswarnUserWarningr   r   r;  r<  )r   r   input_ids_lengthhas_default_max_lengthinput_ids_stringmin_length_error_suffixr;  s          rR   _validate_generated_lengthz*GenerationMixin._validate_generated_length  s    " 	&7&F&NSdSosuSuSuMCTC_    	   0;;;6:k6Te22ZeM0#3 0 09I 0 0%00 0 0 	  + 	  " 	#g5F5Qggg# '38I8TWhWs8s8sMR@Q@\ R R1B1MR R RTkl  
 +7*9<LLJ-888VHYHh V V3CV V5F5QV V VXop  	     8788rQ   synced_gpusstreamerr@   c                    |*t                      rt          j                    dk    rd}nd}|                                  R| j        j        r?t          j        | j                  }|| j        k    rt          j
        d           || _        | j        t          j                   j        d5i |}                                 |                     |                                           ||nt!                      }||nt#                      }j        j        y|                    dd          t*                              d           j        }t/          |t0                    r|d         }t*                              d	| d
           |_        |                     |j        |          \  }}}|j        d         }j        |d<   j        |d<   | j        j        s|dk    rd|d<   n
j        |d<   dtA          tC          j"        | j#                  j$        %                                          v }d|v}|                    dd          (|r&|r$| &                    |j        j                  |d<   | j        j        sdj        ]tO          |j                  dk    rEtQ          j)        |dddf         j        k              dk    rt*                              d           | j        j        rd|vr| *                    |||          }| j        j        r.| +                    |||j,        j        |j-                  \  }}n|dk    r|n|.                    d          }|'|/                    |0                                           |j        d         }|                    d          du oj1        du}j2        <|s+t*                              dj2         dj1         d           j2        |z   _1        | 3                    ||           | 4                    |          }|j5        dk    rtm          d          | j-        j7        |j-        j7        k    rBt          j
        d|j-        j7         d| j-        j7         d| j-        j7         dtp                     | 9                    ||||||	|
          }| :                    |          }|tv          j<        k    rj=        dk    rtm          d j=         d!          |dk    rtm          d"          |d         stm          d#          |j        j        r[t          j        |          }|                    ||j        j        |          \  }}}|*                    |||          }|d         |d$<    | j>        |f|j?        |j?        r| @                              nd|j        j        jA        jB        ||d%|S |tv          jC        k    r+ | jD        |f||j        j        jA        jB        ||d&|S |tv          jE        k    rT|d         stm          d'           | jF        |fjG        jH        ||j        j        jA        jB        ||jI        d(|S |tv          jJ        k    rd| @                              } | jK        d5|j=        | j        j        d)|\  }} | jL        |f|||j        j        jA        jB        ||d*	|S |tv          jM        k    rt          |j5        |j-        jO        jP        j=        j1        +          } | jK        d5|j5        | j        j        d)|\  }} | jQ        ||f||j        j        jA        jB        |d,|S |tv          jR        k    r| @                              }t          |j5        |j-        jO        jP        j=        j1        +          } | jK        d5|j5        | j        j        d)|\  }} | jS        ||f|||j        j        jA        jB        |d-|S |tv          jT        k    rt          |j5        |j-        jO        jP        j=        jU        j1        .          } | jK        d5|j5        | j        j        d)|\  }} | jV        ||f||j        j        jA        jB        |d,|S |tv          jW        k    rg }jX        jX        }jY        ifd/}t/          jY        t0                    rtO          jY                  dk    r
 |             jY        D ]}t/          |d         t0                    rt/          |t0                    rtO          |          dk    r
 |             t          d0 |D                       r
 |             t          d1 |D                       r
 |             t          |          }ndt/          |t0                    rtO          |          dk    r
 |             t          d2 |D                       r
 |             t          |          }|]                    |           !t          ||j5        |j-        jO        jP        j=        j1        3          } | jK        d5|j5        | j        j        d)|\  }} | j_        |f|||j        j        jA        jB        |d4|S dS )6a`  

        Generates sequences of token ids for models with a language modeling head.

        <Tip warning={true}>

        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

        For an overview of generation strategies and code examples, check out the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
                The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
                should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
                `input_ids`, `input_values`, `input_features`, or `pixel_values`.
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which had the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            logits_processor (`LogitsProcessorList`, *optional*):
                Custom logits processors that complement the default logits processors built from arguments and
                generation config. If a logit processor is passed that is already created with the arguments or a
                generation config an error is thrown. This feature is intended for advanced users.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                Custom stopping criteria that complement the default stopping criteria built from arguments and a
                generation config. If a stopping criteria is passed that is already created with the arguments or a
                generation config an error is thrown. This feature is intended for advanced users.
            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
                If provided, this function constraints the beam search to allowed tokens only at each step. If not
                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
                on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
                Retrieval](https://arxiv.org/abs/2010.00904).
            synced_gpus (`bool`, *optional*):
                Whether to continue running the while loop until max_length. Unless overridden this flag will be set to
                `True` under DeepSpeed ZeRO Stage 3 multiple GPUs environment to avoid hanging if one GPU finished
                generating before other GPUs. Otherwise it'll be set to `False`.
            assistant_model (`PreTrainedModel`, *optional*):
                An assistant model that can be used to accelerate generation. The assistant model must have the exact
                same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model
                is much faster than running generation with the model you're calling generate from. As such, the
                assistant model should be much smaller.
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            negative_prompt_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The negative prompt needed for some processors such as CFG. The batch size must match the input batch
                size. This is an experimental feature, subject to breaking API changes in future versions.
            negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Attention_mask for `negative_prompt_ids`.
            kwargs (`Dict[str, Any]`, *optional*):
                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.

        Return:
            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.

                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.GreedySearchDecoderOnlyOutput`],
                    - [`~generation.SampleDecoderOnlyOutput`],
                    - [`~generation.BeamSearchDecoderOnlyOutput`],
                    - [`~generation.BeamSampleDecoderOnlyOutput`]

                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.GreedySearchEncoderDecoderOutput`],
                    - [`~generation.SampleEncoderDecoderOutput`],
                    - [`~generation.BeamSearchEncoderDecoderOutput`],
                    - [`~generation.BeamSampleEncoderDecoderOutput`]
        Nr   TFa&  You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed soon, in a future version. Please use a generation configuration file (see https://huggingface.co/docs/transformers/main_classes/text_generation )r  zThe attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.r   z)Setting `pad_token_id` to `eos_token_id`:z for open-end generation.output_attentionsoutput_hidden_statesr   r   r   r   r   zA decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.)r   r   r   r   r   r   r   r?  zBoth `max_new_tokens` (=z) and `max_length`(=z) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)zZ`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1.z~You are calling .generate() with the `input_ids` being on a device type different than your model's device. `input_ids` is on z, whereas the model is on z. You may experience unexpected behaviors or slower generation. Please make sure that you have put `input_ids` to the correct device by calling for example input_ids = input_ids.to('z ') before running `.generate()`.)r   r)  r*  r+  r,  r   r-  r.  )r   rI  zFnum_return_sequences has to be 1 when doing assisted generate, but is rR  z6assisted generate is only supported for batch_size = 1z+assisted generate requires `use_cache=True`assistant_encoder_outputs)r   r$  r,  logits_warperrI  r   r   output_scoresreturn_dict_in_generater  r  )r,  rI  r   r   r  r  r  r  z,Contrastive search requires `use_cache=True`)r  r%  r,  rI  r   r   r  r  r  r  
sequential)r   r   r   )	r,  r  rI  r   r   r  r  r  r  )r   r  r   length_penaltydo_early_stoppingnum_beam_hyps_to_keepr?  )r,  rI  r   r   r  r  r  )r,  r  rI  r   r   r  r  r  )r   r  r   r  r  r  r&  r?  c                  4    t          d j         d          )Nzn`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]`of positive integers, but is rR  )r   r#  )r   s   rR   	typeerrorz+GenerationMixin.generate.<locals>.typeerror  s3    $]8I8Y] ] ]  rQ   c              3   B   K   | ]}t          |t                     V  d S r   )r   listr   	token_idss     rR   r   z+GenerationMixin.generate.<locals>.<genexpr>  s/      YY9:i#>#>>YYYYYYrQ   c              3   H   K   | ]}t          d  |D                       V  dS )c              3   N   K   | ] }t          |t                     p|d k     V  !dS r   Nr   r   r   token_ids     rR   r   z5GenerationMixin.generate.<locals>.<genexpr>.<genexpr>  s9      jjT\Z#%>%>!>!N(Q,jjjjjjrQ   Nr   r  s     rR   r   z+GenerationMixin.generate.<locals>.<genexpr>  sN         )  jj`ijjjjj     rQ   c              3   N   K   | ] }t          |t                     p|d k     V  !dS r  r  r  s     rR   r   z+GenerationMixin.generate.<locals>.<genexpr>  s9      hhS[Jx$=$= = MAhhhhhhrQ   )r"  r   r  r   r  r  r  r?  )constrained_beam_scorerr,  rI  r   r   r  r  r  rP   )`r   distget_world_sizerx  r   _from_model_configr   from_model_configr   r  r  copydeepcopyupdatevalidater  r+   r=   r   r   r   loggerwarningr   r  r   r   r   r  r  r   r   r   r   r   r   r   r   r   r   rL   rd  r   r   r   r   r   putcpur?  r  r  r(  r  r   rS  r  rH  rN  rr   r~   num_return_sequencesassisted_decodingr$  r  r  r  r|   rt   r{   rs   r  r%  
low_memoryr}   r   ru   r   r   r  early_stoppingrw   r   rx   r   r&  rz   r   r"  r#  r   r   r   r  r   ry   ) r   r   r   r,  rI  r+  r  r   r  r-  r.  r   new_generation_configr   r   r   r   r   accepts_attention_maskrequires_attention_maskr   r  r  r'  assistant_model_kwargsr  beam_scorerfinal_constraintsr  word_ids
constraintr  s      `                             rR   generatezGenerationMixin.generate  s   J )++ $0C0E0E0I0I"# 	""$$$ $ %8 	C(8(J4;(W(W%(D,BBBMc   .CD* $ 6 M*;<</(/99&99""$$$##L$5$5$7$7888 0@/K++QdQfQf1B1N--ThTjTj)16G6T6` 0$77?q   -9L,-- /+ANNn|nnnooo-9* 9=8R8R%2L9
 9
5' #(+
 ->,O()/@/U+, {- 	D2Bo2U2U(,L%%(9(CL%!1S9J4<9X9X9c9h9h9j9j5k5k!k"3<"G,d33;@W;\r;-1-X-X0=?P?]. .L)*
 {- 	 ".:+,,11ImAAArE26G6TTUUXYYYl  
 ;) 	.?|.S.S  NN|-= L
 ;) 
	l&*&T&T%!1)'8'O.;$+ 'U ' '#I|| *:[)H)HlN^N^_jNkNkILL))) %?2.!'L!9!9T!A!nFWFbjnFn+7) f/@/O f f(3f f f   ,=+KN^+^(''(9;KMcddd 334EWW%6%@1%D%Dl   ;y/444M*@I@P@U* *+** * UYT_Td	* * *     55/!1+%=-% 3+I 6 	
 	
 !77/CT 8 
 
 n@@@ 599 H/DH H H   A~~ !YZZZ, P !NOOO %8 f)-|)D)D&JYJoJo!?#D#QSiK KG/1G *9)g)g!#9;K* *& =SSd<e89 *4) /+5!1L]Lgqd556GHHHmq"3.;.;/=(9(Q'!     n:::%4%!1"3.;.;/=(9(Q'!      AAA, Q !OPPP*4*'-/=!1"3.;.;/=(9(Q'!,7       555 334EFFM 'Id&H '#-B#';#A' ' 	' '#I| 4;!1+"3.;.;/=(9(Q'!      :::*%+5$+0?"3"B&7&L,7  K 'Id&H '#-7#';#A' ' 	' '#I| $4# "2"3.;.;/=(9(Q'      ::: 334EFFM +%+5$+0?"3"B&7&L,7  K 'Id&H '#-7#';#A' ' 	' '#I| $4# "2+"3.;.;/=(9(Q'      @@@*%+5$+0?"3"B&7&L 1 A,7	 	 	K 'Id&H '#-7#';#A' ' 	' '#I| *4) "2"3.;.;/=(9(Q'      FFF " ,8$5$A! 0<     ##4#DdKK ,<==BBIKKK 1 A 9 9H!(1+t44 A)(D99 (S]]a=O=O%IKKKYYPXYYYYY (%IKKK  -5     ( &IKKK%:8%D%D

)(D99 (S]]a=O=O%IKKKhh_ghhhhh (%IKKK%6x%@%@
%,,Z8888 'B-%+5$+0?"3"B&7&L,7	' 	' 	'# 'Id&H '#-7#';#A' ' 	' '#I| 04/(?!1"3.;.;/=(9(Q'    y GFrQ   r   r  r%  r  r  r  r  r  r  c           	        ;<= ||nt                      }||nt                      }||nt                      }||n| j        j        }||n| j        j        }||n| j        j        }t          |t                    r|g}|,t          j	        |          
                    |j                  nd}||n| j        j        }|	|	n| j        j        }	|
|
n| j        j        }
||n| j        j        }|r|rdnd}|r|	rdnd}|r|	rdnd}|r|
rdnd}|rJ| j        j        r>|	r|d                             d          nd}|
r|d                             d          nd}t          j        |j        d         t          j        |j                  }d}|j        d         }	 |rot          j	        |rd
nd          
                    |j                  }t-          j        |t,          j        j                   |                                d
k    r	nr|                    d          1d	|d<    | j        |fi |} | di |d	d	|	d}| j        j        r|j        d         }n|j        d         }|j        dddddf         }|                     ||| j        j        d	          }|s | j         d|| j        j        d|\  } }|                    d          }!|!tC          | j"        j#         d          t          |!d         tH          t          j%        f          r|!d         d         j        d         |k    rtC          | j"        j#         d           |||          } |||          }tL          j'        (                    |d          }"t          j)        |"d|          \  }#}$|ra|r||fz  }|	r6|| j        j        r|j*        fn|j+        fz  }| j        j        r||j,        fz  }|
r|| j        j        r|j        fn|j        fz  }g }%|d         D ]u=g }&=D ]Y}'|r+|&-                    |'.                    dd                     /|&-                    |'.                    |d                     Z|%-                    |&           v|%|d<   |rd |D             }(g g g c<;})t_          |          D ]}* | j        |$dd|*f         0                    dd          fi |}+ | di |+d	d	|	d}|(D ]#},|(|,         -                    ||,                    $| j        j        r|j        d         }-|j        }.n|j        d         }-|j        }.<-                    t          j1        |-d                     ;-                    |.           |)-                    |j        dddddf                    t          j2        <fdt_          |          D             d          }-d t_          tg          |.                    D             }/t_          tg          |.                    D ]5=t          j2        ;=fdt_          |          D             d          |/=<   6tI          |/          }.t          j4        |)d          }0ny | j        |$0                    dd          fi |}+ | di |+d	d	|	d}| j        j        r|j        d         }-|j        }.n|j        d         }-|j        }.|j        dddddf         }0|.                    |d          }1tk          |1|-|#||          }2|2
                    d          }2|$t_          tg          |$                    |2f         }3t          j2        t          j6        |-1                    d          |                    }-|-t_          |          |2ddf         }-t          j4        ||-7                    d          gd          }d}4|.D ]G=t          j2        t          j6        =|                    t_          |          |2ddf         =|4=fz  }4H|rC | j        |$dd|2f         0                    dd          fi |}5 | di |5d	ddd}6|6d         }7nt| 8                    |d	          }7d}%|7D ]V=d}&=D ]I}'t          j2        t          j6        |'|d                    }'|'t_          |          |2df         }'|&|'fz  }&J|%|&fz  }%W|%}7t          j2        t          j6        |0|                    t_          |          |2ddf         }| j        j        rd}8d}9|	r|j,        D ]G=t          j2        t          j6        =|d                    t_          |          |2df         =|8=fz  }8H|j*        D ]G=t          j2        t          j6        =|d                    t_          |          |2df         =|9=fz  }9Hts          |7|4|9pd|8pd          }ngd}:|	rO|j+        D ]G=t          j2        t          j6        =|d                    t_          |          |2df         =|:=fz  }:Htu          |7|4|:pd           }|r|r||tC          d!          |3|z  |d|z
  z  z   }3t          j4        ||3dddf         gd          }|'|;                    |3<                                           |                     ||| j        j        "          }||=                    |3>                    |j        d         d          ?                    |7                    d                    @                    d                    }|A                                dk    rd	} |||          rd	}|r|sn	||B                                 |r5| j        j        rt          |||||||#          S t          ||||$          S |S )%u  
        Generates sequences of token ids for models with a language modeling head using **contrastive search** and can
        be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        <Tip warning={true}>

        In most cases, you do not need to call [`~generation.GenerationMixin.contrastive_search`] directly. Use
        generate() instead. For an overview of generation strategies and code examples, check the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            top_k (`int`, *optional*, defaults to 1):
                The size of the candidate set that is used to re-rank for contrastive search
            penalty_alpha (`float`, *optional*, defaults to 0):
                The degeneration penalty for contrastive search; activate when it is larger than 0
            logits_processor (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            logits_warper (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
                to warp the prediction score distribution of the language modeling head applied before multinomial
                sampling at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            synced_gpus (`bool`, *optional*, defaults to `False`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            sequential (`bool`, *optional*):
                Switches topk hidden state computation from parallel to sequential to reduce memory if True.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.ContrastiveSearchDecoderOnlyOutput`], [`~generation.ContrastiveSearchEncoderDecoderOutput`]
            or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.ContrastiveSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.ContrastiveSearchEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.

        Examples:
        ```python
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     AutoModelForCausalLM,
        ...     StoppingCriteriaList,
        ...     MaxLengthCriteria,
        ... )

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
        >>> model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
        >>> # set pad_token_id to eos_token_id because OPT does not have a PAD token
        >>> model.config.pad_token_id = model.config.eos_token_id
        >>> input_prompt = "DeepMind Company is"
        >>> input_ids = tokenizer(input_prompt, return_tensors="pt")
        >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=64)])
        >>> outputs = model.contrastive_search(
        ...     **input_ids, penalty_alpha=0.6, top_k=4, stopping_criteria=stopping_criteria
        ... )
        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ['DeepMind Company is a company that focuses on the development and commercialization of artificial intelligence (AI). DeepMind’s mission is to help people understand and solve problems that are difficult to solve in the world today.\n\nIn this post, we talk about the benefits of deep learning in business and how it']
        ```NrP   r   rE   rF   r   r   FTr  r  opr   r   )r   r  r  r   )r   r   )r   r   zQ does not support caching and therefore **can't** be used for contrastive search.z| does not have a standard cache format and therefore **can't** be used for contrastive search without further modifications.r   )r   r   r   c                     i | ]}|g S rP   rP   )r   r   s     rR   r   z6GenerationMixin.contrastive_search.<locals>.<dictcomp>  s    :::3sB:::rQ   c                      g | ]
}|         S rP   rP   )r   iall_last_hstatess     rR   
<listcomp>z6GenerationMixin.contrastive_search.<locals>.<listcomp>:  s    *U*U*U1+;A+>*U*U*UrQ   c                     g | ]}d S r   rP   )r   r  s     rR   r  z6GenerationMixin.contrastive_search.<locals>.<listcomp>;  s    %P%P%PAa%P%P%PrQ   c                 R    g | ]#}t          j        |                  d           $S r  )rL   squeeze)r   r  all_hstateslayers     rR   r  z6GenerationMixin.contrastive_search.<locals>.<listcomp>>  s.    WWWQ{1~e'<a@@WWWrQ   r  r  .)r   rY   rW   rX   )r   rF   rE   GIf `eos_token_id` is defined, make sure that `pad_token_id` is defined.r   rC   rD   rU   rV   rW   rX   rY   rC   rD   rE   rF   )Er+   r=   r   r   r   r  r   r   rL   tensorr\  r   r  r  r  r  r   r   r   r   r   r   r  
all_reduceReduceOpSUMr   r   rY   rF   r  r  r   r   r   rH   tupler   r   rb  softmaxtopkrW   rE   rX   r  r   ranger[  r  r^  r   r   _ranking_fastsplitr	  r  r   r   r  r  multiler   prodre  endrT   r\   )>r   r   r  r%  r,  r  rI  r   r   r  r  r  r  r  r  r  r   eos_token_id_tensorrD   rW   rX   rY   rU   rV   unfinished_sequencesthis_peer_finishedr   this_peer_finished_flagmodel_inputsr   last_hidden_stateslogit_for_next_step_r   
next_probstop_k_probs	top_k_idsnew_key_valuesr   r   all_outputs
all_logitsr  next_model_inputsr   next_hiddenfull_hidden_statesfinal_full_hstatesr  context_hiddenselected_idxnext_tokensnext_decoder_hidden_statesnext_model_inputselected_outputsnext_past_key_valuesnext_step_cross_attentionsnext_step_decoder_attentionsnext_step_attentionsr  r  r  s>                                                              @@@rR   rs   z"GenerationMixin.contrastive_search(  s{   P 0@/K++QdQfQf)6)BH[H]H]1B1N--ThTjTj'3'?||TE[Eh'3'?||TE[Eh#-#9ZZt?U?`
lC(( 	*(>LQ]Qiel<88;;I<LMMMos)6)BH^Hl!2!>DDZDl 	 %9$D  $J`Ju 	
 '2 $#'? 	  0LML$;\@Q\RRX\"9Z>OZ22VZ'> bCW b^b # 	t{'E 	Vg!q.?!@!D!D\!R!R!RmqH\f./33ODDDbf "
  %z)/!*<EJW`Wghhh"_Q'
Q	  +0,>P7YssVY*Z*Z*]*]^g^n*o*o' 7DM<MNNNN*//11S88  122:,0[)AtA)\\|\\ $  "044ct    ;1 C)0)Fr)J&&)0)>r)B& '.nQQQAAAX&>##GG '+{'E-1	  H     " &Hd&H '$)dk>\' '`l' 'OA| #/"2"23D"E"E"*$>2 2 2 2  
 #?1#5u|7LMM&q)!,215CC$>2 U U U   #3"29>Q"R"R"/-	;N"O"O../B.KKJ%*Z
e%L%L%L"K '   5244F$ H&9=9Wr355^e^p]r& {5 H(W-E,GG(' );96688%35)  N%&78 - -! K KD! KT%;%;A1%;%E%EFFFFT%;%;Eq%;%I%IJJJJ%%e,,,,.<L*+ ;2::':::<>B9 +zu @ @A(J(J9UVUVUVXYUY?K_K_`bdeKfKf(w(wjv(w(w%"d  +$(-1*;	   G  + > >#C(//===={5 C&-&CB&G-4-J** '.&;B&?-4-B*$++EM+q,I,IJJJ&&'9:::%%gnQQQAAAX&>???? $k*U*U*U*Ue*U*U*U[\]]]%P%Ps;M7N7N1O1O%P%P%P""3'9#:#:;;  E05WWWWW%PU,,WWW]^1 1 1&u-- &++=%>%>" :1555
 %GD$Fy~~VXZ[G\G\$m$m`l$m$m!$  ' $)-&7	    ;1 ?")"?"CK)0)F&&")"7";K)0)>& 2qqq1/AA%QAOON
 )kS`bghhL'??511L
 $E#i..$9$9<$GHK+ek+2E2E!2E2L2Le&T&TUUK%eJ&7&7qqq&HIK!&,>@U@UVW@X@X+Y_`!a!a!a)+&+ 7 7EKu$=$=>>uZ?P?PR^`a`a`a?ab*uh6**  6#E4#Eaaao.33B::$ $>J$ $  $(4 $ $&$ $).&+	$ $ $ $  (88I'J$$ (,'K'KGnr'K's's$!#1 / /EE % ) )${5;tU+J+J+JKK#E*$5$5|S$HI$("uh.NN'5$"'+ek&%.H.H"I"I%PZJ[J[]iklklklJl"m {- -/*/1,$ A!(!9 ? ? %EKu!,L,L,L M MeT^N_N_amorNr s2uh>22!(!; A A %EKu!,L,L,L M MeT^N_N_amorNr s4@44)$8*D'C'Kt%?%G4	   (*$$ 9!(!3 9 9 %EKu!,L,L,L M MeT^N_N_amorNr s,8,,0$8"<3;t    1  ''$%nooo),@@<STWkSkCll 	9k!!!T'.B"CLLLI#[__..///CC$+:X D  L
 #.';'?'?$$%8%>q%A1EEHHI\IfIfghIiIijjootuovv( ($
 (++--22)-& ! F33 *%)"! + cQ	f LLNNN" 	{- <'!'9*?'9%5*?    :'!1"7	    rQ   r?  c           	         ||nt                      }||nt                      }|*t          j        dt                     t          ||          }||n| j        j        }||n| j        j        }t          |t                    r|g}|,t          j        |                              |j                  nd}|	|	n| j        j        }	||n| j        j        }||n| j        j        }|
|
n| j        j        }
|
r|	rdnd}|
r|rdnd}|
r|rdnd}|
r|rdnd}|
rJ| j        j        r>|r|d                             d          nd}|r|d                             d          nd}t          j        |j        d         t          j        |j                  }d	}	 |rot          j        |rdnd                              |j                  }t3          j        |t2          j        j                   |                                dk    rn | j        |fi |} | di |d
||d}|r|r|j        dddddf         } |||          }|
ra|	r||fz  }|r6|| j        j        r|j         fn|j!        fz  }| j        j        r||j"        fz  }|r|| j        j        r|j#        fn|j$        fz  }t          j%        |d          }||tM          d          ||z  |d|z
  z  z   }t          j'        ||dddf         gd          }|'|(                    |)                                           | *                    ||| j        j                  }||+                    |,                    |j        d         d          -                    |.                    d                    /                    d                    }|0                                dk    rd
} |||          rd
}|r|snX||1                                 |
r5| j        j        rte          |||||||          S tg          ||||          S |S )aM  
        Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be
        used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        <Tip warning={true}>

        In most cases, you do not need to call [`~generation.GenerationMixin.greedy_search`] directly. Use generate()
        instead. For an overview of generation strategies and code examples, check the [following
        guide](../generation_strategies).

        </Tip>


        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.

            max_length (`int`, *optional*, defaults to 20):
                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
                tokens. The maximum length of the sequence to be generated.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            synced_gpus (`bool`, *optional*, defaults to `False`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GreedySearchDecoderOnlyOutput`], [`~generation.GreedySearchEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GreedySearchEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.

        Examples:

        ```python
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     AutoModelForCausalLM,
        ...     LogitsProcessorList,
        ...     MinLengthLogitsProcessor,
        ...     StoppingCriteriaList,
        ...     MaxLengthCriteria,
        ... )

        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")

        >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
        >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id

        >>> input_prompt = "It might be possible to"
        >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids

        >>> # instantiate logits processors
        >>> logits_processor = LogitsProcessorList(
        ...     [
        ...         MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id),
        ...     ]
        ... )
        >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])

        >>> outputs = model.greedy_search(
        ...     input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria
        ... )

        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ["It might be possible to get a better understanding of the nature of the problem, but it's not"]
        ```Nz`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.rP   r   rE   rF   r   r   FTr  r  r  r   r  r  r   r   r  r   r  r  r  )4r+   r=   r  r  r  r>   r   r   r   r   r   rL   r  r\  r   r  r  r  r  r   r   r   r   r   r   r  r  r  r  r   r   r  rW   rE   rX   rY   rF   argmaxr   r   r  r  r  r  r  r   r	  r  re  r  r_   rB   )r   r   r,  rI  r?  r   r   r  r  r  r  r  r  r   r  rD   rW   rX   rY   rU   rV   r  r  r  r  r   next_token_logitsnext_tokens_scoresr  s                                rR   rt   zGenerationMixin.greedy_search  s   Z 0@/K++QdQfQf1B1N--ThTjTj!Mq  
 !;;Lj Y Y'3'?||TE[Eh'3'?||TE[EhlC(( 	*(>LQ]Qiel<88;;I<LMMMos)6)BH^Hl!2!>DDZDl 	 %9$D  $J`Ju 	
 '2 $#'? 	  0LML$;\@Q\RRX\"9Z>OZ22VZ'> bCW b^b # 	t{'E 	Vg!q.?!@!D!D\!R!R!RmqH\f./33ODDDbf "
  %z)/!*<EJW`Wghhh"P	  +0,>P7YssVY*Z*Z*]*]^g^n*o*o' 7DM<MNNNN*//11S88 >4=iXX<XXL d   "3%9	   G  1  'qqq"aaax 8 "2!1)=N!O!O '   4133F$ H&9=9Wr355^e^p]r& {5 H(W-E,GG(' );96688%35)  ,'9rBBBK ''$%nooo),@@<STWkSkCll 	9k!!!T'.B"CLLLI#[__..///CC$+:X D  L
 #.';'?'?$$%8%>q%A1EEHHI\IfIfghIiIijjootuovv( ($
 (++--22)-& ! F33 *%)"! + aP	d LLNNN" 	{- 7'!'9*?'9%5*?    5'!1"7	    rQ   c           	      $	   ||nt                      }||nt                      }|*t          j        dt                     t          ||          }||nt                      }||n| j        j        }||n| j        j        }t          |t                    r|g}|,t          j        |                              |j                  nd}|
|
n| j        j        }
||n| j        j        }|	|	n| j        j        }	||n| j        j        }|r|
rdnd}|r|rdnd}|r|rdnd}|r|	rdnd}|rJ| j        j        r>|r|d                             d          nd}|	r|d                             d          nd}t          j        |j        d         t          j        |j                  }d	}	 |rot          j        |rdnd                              |j                  }t3          j        |t2          j        j                   |                                dk    rn' | j        |fi |} | di |d
||	d}|r|r|j        dddddf         } |||          } |||          }|ra|
r||fz  }|r6|| j        j        r|j         fn|j!        fz  }| j        j        r||j"        fz  }|	r|| j        j        r|j#        fn|j$        fz  }tJ          j&        '                    |d          }t          j(        |d          )                    d          }||tU          d          ||z  |d|z
  z  z   }t          j+        ||dddf         gd          }|'|,                    |-                                           | .                    ||| j        j                  }||/                    |0                    |j        d         d          1                    |2                    d                    3                    d                    }|4                                dk    rd
} |||          rd
}|r|sn||5                                 |r5| j        j        rtm          |||||||          S to          ||||          S |S )a  
        Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        <Tip warning={true}>

        In most cases, you do not need to call [`~generation.GenerationMixin.sample`] directly. Use generate() instead.
        For an overview of generation strategies and code examples, check the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            logits_warper (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
                to warp the prediction score distribution of the language modeling head applied before multinomial
                sampling at each generation step.
            max_length (`int`, *optional*, defaults to 20):
                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
                tokens. The maximum length of the sequence to be generated.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            synced_gpus (`bool`, *optional*, defaults to `False`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.SampleDecoderOnlyOutput`], [`~generation.SampleEncoderDecoderOutput`] or `torch.LongTensor`:
            A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.SampleDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.SampleEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.

        Examples:

        ```python
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     AutoModelForCausalLM,
        ...     LogitsProcessorList,
        ...     MinLengthLogitsProcessor,
        ...     TopKLogitsWarper,
        ...     TemperatureLogitsWarper,
        ...     StoppingCriteriaList,
        ...     MaxLengthCriteria,
        ... )
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")

        >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token
        >>> model.config.pad_token_id = model.config.eos_token_id
        >>> model.generation_config.pad_token_id = model.config.eos_token_id

        >>> input_prompt = "Today is a beautiful day, and"
        >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids

        >>> # instantiate logits processors
        >>> logits_processor = LogitsProcessorList(
        ...     [
        ...         MinLengthLogitsProcessor(15, eos_token_id=model.generation_config.eos_token_id),
        ...     ]
        ... )
        >>> # instantiate logits processors
        >>> logits_warper = LogitsProcessorList(
        ...     [
        ...         TopKLogitsWarper(50),
        ...         TemperatureLogitsWarper(0.7),
        ...     ]
        ... )

        >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])

        >>> torch.manual_seed(0)  # doctest: +IGNORE_RESULT
        >>> outputs = model.sample(
        ...     input_ids,
        ...     logits_processor=logits_processor,
        ...     logits_warper=logits_warper,
        ...     stopping_criteria=stopping_criteria,
        ... )

        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ['Today is a beautiful day, and we must do everything possible to make it a day of celebration.']
        ```N`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.rP   r   rE   rF   r   r   FTr  r  r  r  r   r   r   num_samplesr  r  r  r  )8r+   r=   r  r  r  r>   r   r   r   r   r   rL   r  r\  r   r  r  r  r  r   r   r   r   r   r   r  r  r  r  r   r   r  rW   rE   rX   rY   rF   r   rb  r  multinomialr  r   r   r  r  r  r  r  r   r	  r  re  r  rc   ra   )r   r   r,  rI  r  r?  r   r   r  r  r  r  r  r  r   r  rD   rW   rX   rY   rU   rV   r  r  r  r  r   r  next_token_scoresprobsr  s                                  rR   ru   zGenerationMixin.sample	  s-   @ 0@/K++QdQfQf1B1N--ThTjTj!Mo  
 !;;Lj Y Y)6)BH[H]H]'3'?||TE[Eh'3'?||TE[EhlC(( 	*(>LQ]Qiel<88;;I<LMMMos)6)BH^Hl!2!>DDZDl 	 %9$D  $J`Ju 	
 '2 $#'? 	  0LML$;\@Q\RRX\"9Z>OZ22VZ'> bCW b^b # 	t{'E 	Vg!q.?!@!D!D\!R!R!RmqH\f./33ODDDbf "
  %z)/!*<EJW`Wghhh"R	  +0,>P7YssVY*Z*Z*]*]^g^n*o*o' 7DM<MNNNN*//11S88 >4=iXX<XXL d   "3%9	   G  1  'qqq"aaax 8 !1 0<M N N -i9J K K '   3022F$ H&9=9Wr355^e^p]r& {5 H(W-E,GG(' );96688%35) M))*;)DDE+EqAAAII!LLK ''$%nooo),@@<STWkSkCll 	9k!!!T'.B"CLLLI#[__..///CC$+:X D  L
 #.';'?'?$$%8%>q%A1EEHHI\IfIfghIiIijjootuovv( ($
 (++--22)-& ! F33 *%)"! + eR	h LLNNN" 	{- 1'!'9*?'9%5*?    /'!1"7	    rQ   r  c                 F  '( ||nt                      }||nt                      }|*t          j        dt                     t          ||          }t          |          dk    rt          j        dt                     ||n| j        j        }||n| j        j	        }t          |t                    r|g}|
|
n| j        j        }
||n| j        j        }|	|	n| j        j        }	||n| j        j        }t          |j                  }|j        }|j        \  }}||z  |k    rt'          d||z   d| d          |r|
rdnd}|r(|
r&t)          d	 t+          |          D                       nd(|r|rdnd}|r|rdnd}|r|	rdnd}|rJ| j        j        r>|r|d
                             d          nd}|	r|d
                             d          nd}t3          j        ||ft2          j        |j                  }d|ddddf<   |                    ||z  f          }d}	 |rot3          j        |rdnd                              |j                  }tA          j!        |t@          j"        j#                   |$                                dk    rn| | j%        |fi |} | di |d||	d}|r|r|dz   }|j&        dddddf         }tN          j(        )                    |d          } |||          }||dddf         *                    |          z   }|ra|
r||fz  }|r6|| j        j        r|j+        fn|j,        fz  }| j        j        r||j-        fz  }|	r|| j        j        r|j.        fn|j/        fz  }|j        d         } |                    ||| z            }|rt          |          nd}!t3          j0        |tc          dd|!z             |z  ddd          \  }}"t3          j2        |"| d          }#|"| z  }"|3                    |||"|#||(          }$|$d         }|$d         }%|$d         't3          j4        |'ddf         |%5                    d          gd          }| 6                    ||| j        j                   }|d!         | 7                    |d!         '          |d!<   |r8|
r6t)          '(fd"t+          t          (                    D                       (|dz   }|j8        s |||          r|snd}|9                    |||"|#|||j:        (#          }&|rd|
sd|&d$<   | j        j        r*tw          |&d%         |&d$         ||&d&         |||||'	  	        S ty          |&d%         |&d$         ||&d&         ||(          S |&d%         S ))a  
        Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        <Tip warning={true}>

        In most cases, you do not need to call [`~generation.GenerationMixin.beam_search`] directly. Use generate()
        instead. For an overview of generation strategies and code examples, check the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            beam_scorer (`BeamScorer`):
                An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
                sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
            logits_processor (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            max_length (`int`, *optional*, defaults to 20):
                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
                tokens. The maximum length of the sequence to be generated.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            synced_gpus (`bool`, *optional*, defaults to `False`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.BeamSearchEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.


        Examples:

        ```python
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     AutoModelForSeq2SeqLM,
        ...     LogitsProcessorList,
        ...     MinLengthLogitsProcessor,
        ...     BeamSearchScorer,
        ... )
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

        >>> encoder_input_str = "translate English to German: How old are you?"
        >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids


        >>> # lets run beam search using 3 beams
        >>> num_beams = 3
        >>> # define decoder start token ids
        >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
        >>> input_ids = input_ids * model.config.decoder_start_token_id

        >>> # add encoder_outputs to model keyword arguments
        >>> model_kwargs = {
        ...     "encoder_outputs": model.get_encoder()(
        ...         encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
        ...     )
        ... }

        >>> # instantiate beam scorer
        >>> beam_scorer = BeamSearchScorer(
        ...     batch_size=1,
        ...     num_beams=num_beams,
        ...     device=model.device,
        ... )

        >>> # instantiate logits processors
        >>> logits_processor = LogitsProcessorList(
        ...     [
        ...         MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
        ...     ]
        ... )

        >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)

        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ['Wie alt bist du?']
        ```Nr  r   KYou don't have defined any stopping_criteria, this will likely loop forever)Batch dimension of `input_ids` should be 	, but is rR  rP   c              3      K   | ]}d V  dS rP   NrP   r   r  s     rR   r   z.GenerationMixin.beam_search.<locals>.<genexpr>  "      55"555555rQ   r   rE   rF   r       er   FTr  r  r  r  r   r   r   r   largestsortedfloorrounding_moder   r   rg   next_beam_scoresnext_beam_tokensnext_beam_indicesr  r   c              3   H   K   | ]}|                  |         fz   V  d S r   rP   r   r  r  rg   s     rR   r   z.GenerationMixin.beam_search.<locals>.<genexpr>  8      %s%sUVl8A;&?8A;.&P%s%s%s%s%s%srQ   r   r   r?  rg   sequence_scoresrC   rg   	rC   rf   rD   rg   rU   rV   rW   rX   rY   rC   rf   rD   rg   rE   rF   )=r+   r=   r  r  r  r>   r   r   r   r   r   r   r  r  r  r  
_beam_hypsr  r   r   r  r  r   r   r   rL   zerosfloatr   r[  r  r\  r  r  r  r  r   r   r  r   rb  rc  	expand_asrW   rE   rX   rY   rF   r  re  divprocessr   r	  r  r  is_donefinalizer?  rk   re   ))r   r   r  r,  rI  r?  r   r   r  r  r  r  r  r   r   r  batch_beam_sizecur_lenrD   rW   rX   rY   rU   rV   beam_scoresr  r  r  r   r  r
  next_token_scores_processedra  n_eos_tokensr  next_indicesbeam_outputsbeam_next_tokenssequence_outputsr  rg   s)                                          @@rR   rw   zGenerationMixin.beam_search  s   v 0@/K++QdQfQf1B1N--ThTjTj!Mo  
 !;;Lj Y Y !!Q&&Mgituuu'3'?||TE[Eh'3'?||TE[EhlC(( 	*(>L)6)BH^Hl!2!>DDZDl 	 %9$D  $J`Ju 	
 '2 $#'? 	  /00
)	#,? z!_44oI
<Roo]looo  
 0LML:QnVcnE55eO44555555jn 	 %<\@Q\RRX\"9Z>OZ22VZ'> bCW b^b # 	t{'E 	Vg!q.?!@!D!D\!R!R!RmqH\f./33ODDDbf " k:y"9U^Uefff!AAAqrrE!&&
Y(>'@AA"`	.  +0,>P7YssVY*Z*Z*]*]^g^n*o*o' 7DM<MNNNN*//11S88=4=iXX<XXLd   "3%9	   G  1 !A+ 'qqq"aaax 8 " 9 9!r !: ! ! +;*:9FW*X*X' ;k!!!T'>R>\>\]n>o>o o '   =:<<F$ H&9=9Wr355^e^p]r& {5 H(W-E,GG(' );96688%35) +04J 1 6 6z9zCY Z Z 1=C3|,,,!L-2Z!3q!l*:#;#;i#GQX\ei. . .*{ !9[*GTTTL%
2K '..!))) /  L ''9:K+,>?#$78H	9Xqqq[#9;K;U;UVX;Y;Y"Z`bcccICC$+:X D  L -.:262E2ElSdFego2p2p./& u= u$%s%s%s%s%sZ_`cdp`q`qZrZr%s%s%stt kG" .&7&7	6&J&J ." .)-&A`	.D '//%%(3% 0 	
 	
 # 	1  ;6: !23{- 5.{;%56G%H!!1.!A'9*?'9%5*?
 
 
 
 3.{;%56G%H!!1.!A1"7    $K00rQ   c                 8  )* ||nt                      }||nt                      }|*t          j        dt                     t          ||          }||n| j        j        }||n| j        j        }t          |t                    r|g}||n| j        j        }|	|	n| j        j        }	|
|
n| j        j        }
||n| j        j        }t          |j                  }|j        }|j        \  }}|r|rdnd}|r(|r&t'          d t)          |          D                       nd*|r|	rdnd}|r|	rdnd}|r|
rdnd}|rJ| j        j        r>|	r|d                             d          nd}|
r|d                             d          nd}t1          j        ||ft0          j        |j                  }|                    ||z  f          }d	}	 |rot1          j        |rdnd                              |j                  }t?          j         |t>          j!        j"                   |#                                dk    rn | j$        |fi |} | di |d
|	|
d}|r|r|dz   }|j%        dddddf         }tL          j'        (                    |d          } |||          } | |dddf         )                    |          z   } |||          }|rk|r| |||           fz  }|	r6|| j        j        r|j*        fn|j+        fz  }| j        j        r||j,        fz  }|
r|| j        j        r|j-        fn|j.        fz  }|j        d         }!|                    |||!z            }tL          j'        /                    |d          }"t1          j0        |"d|z            }#t1          j1        |d|#          }t1          j2        |d
d          \  }}$t1          j1        |#d|$          }#t1          j3        |#|!d          }%|#|!z  }#|4                    |||#|%||*          }&|&d         }|&d         }'|&d         )t1          j5        |)ddf         |'6                    d          gd          }| 7                    ||| j        j                  }|d         | 8                    |d         )          |d<   |r8|r6t'          )*fdt)          t          *                    D                       *|dz   }|j9        s |||          r|snd
}@|:                    |||#|%|||j;        *          }(|rd|sd|(d<   | j        j        r*ty          |(d          |(d         ||(d!         |||||"	  	        S t{          |(d          |(d         ||(d!         ||#          S |(d          S )$a  
        Generates sequences of token ids for models with a language modeling head using **beam search multinomial
        sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        <Tip warning={true}>

        In most cases, you do not need to call [`~generation.GenerationMixin.beam_sample`] directly. Use generate()
        instead. For an overview of generation strategies and code examples, check the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            beam_scorer (`BeamScorer`):
                A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
                sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
            logits_processor (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            logits_warper (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
                to warp the prediction score distribution of the language modeling head applied before multinomial
                sampling at each generation step.
            max_length (`int`, *optional*, defaults to 20):
                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
                tokens. The maximum length of the sequence to be generated.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            synced_gpus (`bool`, *optional*, defaults to `False`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.BeamSampleDecoderOnlyOutput`], [`~generation.BeamSampleEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.BeamSampleDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.BeamSampleEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.

        Examples:

        ```python
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     AutoModelForSeq2SeqLM,
        ...     LogitsProcessorList,
        ...     MinLengthLogitsProcessor,
        ...     TopKLogitsWarper,
        ...     TemperatureLogitsWarper,
        ...     BeamSearchScorer,
        ... )
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

        >>> encoder_input_str = "translate English to German: How old are you?"
        >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids

        >>> # lets run beam search using 3 beams
        >>> num_beams = 3
        >>> # define decoder start token ids
        >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
        >>> input_ids = input_ids * model.config.decoder_start_token_id

        >>> # add encoder_outputs to model keyword arguments
        >>> model_kwargs = {
        ...     "encoder_outputs": model.get_encoder()(
        ...         encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
        ...     )
        ... }

        >>> # instantiate beam scorer
        >>> beam_scorer = BeamSearchScorer(
        ...     batch_size=1,
        ...     max_length=model.config.max_length,
        ...     num_beams=num_beams,
        ...     device=model.device,
        ... )

        >>> # instantiate logits processors
        >>> logits_processor = LogitsProcessorList(
        ...     [MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id)]
        ... )
        >>> # instantiate logits processors
        >>> logits_warper = LogitsProcessorList(
        ...     [
        ...         TopKLogitsWarper(50),
        ...         TemperatureLogitsWarper(0.7),
        ...     ]
        ... )

        >>> outputs = model.beam_sample(
        ...     input_ids, beam_scorer, logits_processor=logits_processor, logits_warper=logits_warper, **model_kwargs
        ... )

        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ['Wie alt bist du?']
        ```Nr  rP   c              3      K   | ]}d V  dS r  rP   r  s     rR   r   z.GenerationMixin.beam_sample.<locals>.<genexpr>  r  rQ   r   rE   rF   r   FTr  r  r  r  r   r   r   r   r  )
descendingr   r  r  r  r  r  r  r  r   c              3   H   K   | ]}|                  |         fz   V  d S r   rP   r   s     rR   r   z.GenerationMixin.beam_sample.<locals>.<genexpr>Z  r!  rQ   r"  r#  rC   rg   r$  r%  )>r+   r=   r  r  r  r>   r   r   r   r   r   r  r  r  r  r   r&  r  r   r  r  r   r   r   rL   r'  r(  r   r[  r  r\  r  r  r  r  r   r   r  r   rb  rc  r)  rW   rE   rX   rY   rF   r  r	  rg  sortr*  r+  r   r	  r  r  r,  r-  r?  rp   rn   )+r   r   r  r,  rI  r  r?  r   r   r  r  r  r  r  r   r   r  r.  r/  rD   rW   rX   rY   rU   rV   r0  r  r  r  r   r  r
  r1  ra  r  r  _indicesr3  r4  r5  r6  r  rg   s+                                            @@rR   rx   zGenerationMixin.beam_sampleB  s   P 0@/K++QdQfQf1B1N--ThTjTj!Mo  
 !;;Lj Y Y'3'?||TE[Eh'3'?||TE[EhlC(( 	*(>L)6)BH^Hl!2!>DDZDl 	 %9$D  $J`Ju 	
 '2 $#'? 	  /00
)	#,?  0LML:QnVcnE55eO44555555jn 	 %<\@Q\RRX\"9Z>OZ22VZ'> bCW b^b # 	t{'E 	Vg!q.?!@!D!D\!R!R!RmqH\f./33ODDDbf " k:y"9U^Uefff!&&
Y(>'@AA"f	.  +0,>P7YssVY*Z*Z*]*]^g^n*o*o' 7DM<MNNNN*//11S88=4=iXX<XXLd   "3%9	   G  1 !A+ 'qqq"aaax 8 " 9 9!r !: ! ! +;*:9FW*X*X' ;k!!!T'>R>\>\]n>o>o o !.i9J K K '   W}}Y8STTVVF$ H&9=9Wr355^e^p]r& {5 H(W-E,GG(' );96688%35) +04J 1 6 6z9zCY Z ZM))*;)DDE+Eq9}MMMK %->K P P*/*5FSW]^*_*_*_'x,{BAAK 9[*GTTTL%
2K '..!))) /  L ''9:K+,>?#$78H	9Xqqq[#9;K;U;UVX;Y;Y"Z`bcccICC$+:X D  L -.:262E2ElSdFego2p2p./& u= u$%s%s%s%s%sZ_`cdp`q`qZrZr%s%s%stt kG" .&7&7	6&J&J ." .)-&Mf	.P '//%%(3% 0 	
 	
 # 	1  ;6: !23{- 5.{;%56G%H!!1.!A'9*?'9%5*?
 
 
 
 3.{;%56G%H!!1.!A1"7    $K00rQ   c                 N  1234567 ||nt                      }||nt                      }|*t          j        dt                     t          ||          }||n| j        j        }||n| j        j        }t          |t                    r|g}|
|
n| j        j        }
||n| j        j        }|	|	n| j        j        }	||n| j        j        }|j        6|j        }6|z  7t#          |j                  |z  2|j        }|j        \  }}|r|
r27fdt+          |          D             5nd562z  |k    rt-          d62z   d| d          |r|
rdnd}|r|rdnd}|r|rdnd}|r|	rdnd}|rJ| j        j        r>|r|d                             d	          nd}|	r|d                             d
          nd}t5          j        26fdt4          j        |          }d|dddd7f<   |                    26z  f          }d}	 |rot5          j        |rdnd                              |j                  }tA          j!        |t@          j"        j#                   |$                                dk    rn
t5          j%        26z  |j&        |          }t5          j%        26z  t4          j'        |          } | j(        |fi |} | di |d||	d}|r|r|dz   }|
r&t5          j)        |j*        dddddf                   }t+          |          D ]r337z  } tW          | 7z   6          }!|!| z
  }"g }#t+          2          D ]21|#,                    16fdt+          | |!          D                        3||#         }$|j*        |#dddf         }%tZ          j.        /                    |%d          }&|&j        d         }' ||$|&|3          }(|(||#         0                    d          z   }&|&1                    |(          }&|
r|(||#<   |&                    2|"|'z            }&|rt#          |          nd})t5          j2        |&tg          dd|)z             |"z  ddd          \  }&}*t5          j4        |*|'d          }+|*|'z  }*5tk          5d          nd},|6                    |$|&|*|+|||,3          }-|-d         ||#<   |-d         }.|-d          4|rB|
r@to          345fd!t+          t#          5d                             D                       53<   |$4         ||#<   t5          j8        |$4ddf         |.0                    d          gd          }$|$dddf         ||#<   6t5          j4        4|"d          z  | z   4|"z  z   ||#<   t|ra|
r||fz  }|r6|| j        j        r|j9        fn|j:        fz  }| j        j        r||j;        fz  }|	r|| j        j        r|j<        fn|j=        fz  }t5          j8        ||0                    d          gd          }| >                    ||| j        j        "          }|d#         | ?                    |d#         |          |d#<   |dz   }|j@        s |||          r|snd}{5tk          5d          nd}/|A                    |||*|+|||jB        |/$          }0|rd|
sd|0d%<   | j        j        r*t          |0d&         |0d%         ||0d'         |||||(	  	        S t          |0d&         |0d%         ||0d'         ||)          S |0d&         S )*a  
        Generates sequences of token ids for models with a language modeling head using **diverse beam search
        decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        <Tip warning={true}>

        In most cases, you do not need to call [`~generation.GenerationMixin.group_beam_search`] directly. Use
        generate() instead. For an overview of generation strategies and code examples, check the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            beam_scorer (`BeamScorer`):
                An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
                sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
            logits_processor (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            max_length (`int`, *optional*, defaults to 20):
                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
                tokens. The maximum length of the sequence to be generated.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            synced_gpus (`bool`, *optional*, defaults to `False`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)

            model_kwargs:
                Additional model specific kwargs that will be forwarded to the `forward` function of the model. If
                model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.BeamSearchDecoderOnlyOutput`] if [`~generation.BeamSearchDecoderOnlyOutput`] if
            `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a
            [`~generation.BeamSearchEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`.

        Examples:

        ```python
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     AutoModelForSeq2SeqLM,
        ...     LogitsProcessorList,
        ...     MinLengthLogitsProcessor,
        ...     HammingDiversityLogitsProcessor,
        ...     BeamSearchScorer,
        ... )
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

        >>> encoder_input_str = "translate English to German: How old are you?"
        >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids


        >>> # lets run diverse beam search using 6 beams
        >>> num_beams = 6
        >>> # define decoder start token ids
        >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
        >>> input_ids = input_ids * model.config.decoder_start_token_id

        >>> # add encoder_outputs to model keyword arguments
        >>> model_kwargs = {
        ...     "encoder_outputs": model.get_encoder()(
        ...         encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
        ...     )
        ... }

        >>> # instantiate beam scorer
        >>> beam_scorer = BeamSearchScorer(
        ...     batch_size=1,
        ...     max_length=model.config.max_length,
        ...     num_beams=num_beams,
        ...     device=model.device,
        ...     num_beam_groups=3,
        ... )

        >>> # instantiate logits processors
        >>> logits_processor = LogitsProcessorList(
        ...     [
        ...         HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3),
        ...         MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
        ...     ]
        ... )

        >>> outputs = model.group_beam_search(
        ...     input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs
        ... )

        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ['Wie alt bist du?']
        ```Nr  c           	      b    g | ]+}t          d  t          z            D                       ,S )c              3      K   | ]}d V  dS r  rP   r  s     rR   r   z?GenerationMixin.group_beam_search.<locals>.<listcomp>.<genexpr>0  s"      !P!P"!P!P!P!P!P!PrQ   )r  r  )r   r  r   num_sub_beamss     rR   r  z5GenerationMixin.group_beam_search.<locals>.<listcomp>0  s>    qqqUVE!P!PeMJ4N.O.O!P!P!PPPqqqrQ   r  r  rR  rP   r   rE   rF   r  r   r   FTr  r  r  r  r   r   c                      g | ]
}z  |z   S rP   rP   )r   idx	batch_idxr  s     rR   r  z5GenerationMixin.group_beam_search.<locals>.<listcomp>x  s#    fffY.4fffrQ   r   )current_tokensbeam_group_idxr   r  r  r  )r   r   rg   group_indexr  r  r  c              3   T   K   | ]"}         |                  |         fz   V  #d S r   rP   )r   r  rE  r  rg   s     rR   r   z4GenerationMixin.group_beam_search.<locals>.<genexpr>  sM       9 9WX^4Xa[AXa[NR9 9 9 9 9 9rQ   r  r   r"  r#  rC   rg   r$  r%  )Er+   r=   r  r  r  r>   r   r   r   r   r   r  r  r  r  r  r&  r   r&  r   r   r  r   r   r   r   rL   fullr(  r[  r  r\  r  r  r  r  r   r'  r   r   r   
zeros_liker  minrT  r   rb  rc  r	  r)  r  re  r*  rd  r+  r  r   rW   rE   rX   rY   rF   r  r  r,  r-  r?  rk   re   )8r   r   r  r,  rI  r?  r   r   r  r  r  r  r  r   r&  r   r.  r/  rD   rW   rX   rY   rU   rV   r0  r  r  rD  reordering_indicesr  r   processed_scoregroup_start_idxgroup_end_idx
group_sizebatch_group_indicesgroup_input_idsr  r
  ra  r1  r2  r  r3  process_beam_indicesr4  r5  final_beam_indicesr6  rC  r   rE  r  rg   r  r@  s8                                                    @@@@@@@rR   rz   z!GenerationMixin.group_beam_search  sC
   B 0@/K++QdQfQf1B1N--ThTjTj!Mo  
 !;;Lj Y Y'3'?||TE[Eh'3'?||TE[EhlC(( 	*(>L)6)BH^Hl!2!>DDZDl 	 %9$D  $J`Ju 	
 '2 $#'? 	   )	%5!_4/00OC
!#,? " 	 } 	 qqqqqZ_`oZpZpqqqLLLz!_44oI
<Roo]looo  
 0LML$;\@Q\RRX\"9Z>OZ22VZ'> bCW b^b # 	t{'E 	Vg!q.?!@!D!D\!R!R!RmqH\f./33ODDDbf " j*i!8$ekZ`aaa*+AAA&'!&&
Y(>'@AA"P	.  +0,>P7YssVY*Z*Z*]*]^g^n*o*o' 7DM<MNNNN*//11S88 #[i)?y_efffN "'Z)-C5:^d!e!e!e >4=iXX<XXLd   "3%9	   G  1 !A+ M"'"27>!!!R(3K"L"L"'"8"8 J J"0="@ #Om$CY O O*_<
 ')#!&z!2!2  I'..fffffoWd@e@efff    #,,?"@ %,N3FAAA3M$N!$&M$=$=%2 %> % %! /4R8
.>.>#%6~ft/ / /+ %@+NaBbBlBlmoBpBp$p!$5$?$?@[$\$\!  W;VO$78 %6$:$::zT^G^$_$_! 5AGs<000a16%s1a,.>'?'?*'LRS]ajn2 2 2.!;  %yjPWXXX)J6 AM@Xs<'<'<'<^b$*22#% !-!-!5 .  3 	  	  4@@R3S/0#/0B#C '(;<* } 38 9 9 9 9 9 9\abefrstfubvbv\w\w9 9 9 4 4L0 2A1J	-."')_Xqqq[-IK[KeKefhKiKi,jpr"s"s"s6Eaaae6L23
 	(Jg V V VV%&*,. ##677 '   100F$ H&9=9Wr355^e^p]r& {5 H(W-E,GG(' );96688%35) 	9n.F.Fr.J.J"KQSTTTICC$+:X D  L -.:262E2E !235G3 3./
 kG" .&7&7	6&J&J ." .)-&aP	.d 7C6NSr222TX&//%%(3+ 0 	
 	
 # 	1  ;6: !23{- 5.{;%56G%H!!1.!A'9*?'9%5*?
 
 
 
 3.{;%56G%H!!1.!A1"7    $K00rQ   r  c                 p  () ||nt                      }||nt                      }|*t          j        dt                     t          ||          }t          |          dk    rt          j        dt                     ||n| j        j        }||n| j        j	        }t          |t                    r|g}|
|
n| j        j        }
||n| j        j        }|	|	n| j        j        }	||n| j        j        }t          |j                  }|j        }|j        \  }}||z  |k    rt'          d||z   d| d          |r|
rdnd}|r(|
r&t)          d	 t+          |          D                       nd)|r|rdnd}|r|rdnd}|r|	rdnd}|rJ| j        j        r>|r|d
                             d          nd}|	r|d
                             d          nd}t3          j        ||ft2          j        |j                  }d|ddddf<   |                    ||z  f          }d}	 |rot3          j        |rdnd                              |j                  }tA          j!        |t@          j"        j#                   |$                                dk    rn | j%        |fi |} | di |d||	d}|r|r|dz   }|j&        dddddf         }tN          j(        )                    |d          } |||          }||dddf         *                    |          z   }|+                                } |ra|
r||fz  }|r6|| j        j        r|j,        fn|j-        fz  }| j        j        r||j.        fz  }|	r|| j        j        r|j/        fn|j0        fz  }|j        d         }!|                    |||!z            }|rt          |          nd}"t3          j1        |te          dd|"z             |z  ddd          \  }}#|#|!z  3                                }$|#|!z  }#|4                    |||#|$| ||)          }%|%d         }|%d         }&|%d         (t3          j5        |(ddf         |&6                    d          gd          }| 7                    ||| j        j                  }|d         | 8                    |d         (          |d<   |r8|
r6t)          ()fd t+          t          )                    D                       )|dz   }|j9        s |||          r|snd}|:                    |||#|$|||j;        )!          }'|rd|
sd|'d"<   | j        j        r*ty          |'d#         |'d"         ||'d$         |||||%	  	        S t{          |'d#         |'d"         ||'d$         ||&          S |'d#         S )'a  
        Generates sequences of token ids for models with a language modeling head using **constrained beam search
        decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        <Tip warning={true}>

        In most cases, you do not need to call [`~generation.GenerationMixin.constrained_beam_search`] directly. Use
        generate() instead. For an overview of generation strategies and code examples, check the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            constrained_beam_scorer (`ConstrainedBeamSearchScorer`):
                A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
                sorted during generation, while satisfying a list of positive constraints. For more information, the
                documentation of [`ConstrainedBeamSearchScorer`] should be read.
            logits_processor (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            logits_warper (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
                to warp the prediction score distribution of the language modeling head applied before multinomial
                sampling at each generation step.
            max_length (`int`, *optional*, defaults to 20):
                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
                tokens. The maximum length of the sequence to be generated.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            synced_gpus (`bool`, *optional*, defaults to `False`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`generation.BeamSearchDecoderOnlyOutput`], [`~generation.BeamSearchEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.BeamSearchEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.


        Examples:

        ```python
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     AutoModelForSeq2SeqLM,
        ...     LogitsProcessorList,
        ...     MinLengthLogitsProcessor,
        ...     ConstrainedBeamSearchScorer,
        ...     PhrasalConstraint,
        ... )
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

        >>> encoder_input_str = "translate English to German: How old are you?"
        >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids


        >>> # lets run beam search using 3 beams
        >>> num_beams = 3
        >>> # define decoder start token ids
        >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
        >>> input_ids = input_ids * model.config.decoder_start_token_id

        >>> # add encoder_outputs to model keyword arguments
        >>> model_kwargs = {
        ...     "encoder_outputs": model.get_encoder()(
        ...         encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
        ...     )
        ... }

        >>> constraint_str = "Sie"
        >>> constraint_token_ids = tokenizer.encode(constraint_str)[:-1]  # slice to remove eos token
        >>> constraints = [PhrasalConstraint(token_ids=constraint_token_ids)]


        >>> # instantiate beam scorer
        >>> beam_scorer = ConstrainedBeamSearchScorer(
        ...     batch_size=1, num_beams=num_beams, device=model.device, constraints=constraints
        ... )

        >>> # instantiate logits processors
        >>> logits_processor = LogitsProcessorList(
        ...     [
        ...         MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
        ...     ]
        ... )

        >>> outputs = model.constrained_beam_search(
        ...     input_ids, beam_scorer, constraints=constraints, logits_processor=logits_processor, **model_kwargs
        ... )

        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ['Wie alt sind Sie?']
        ```Nr  r   r  r  r  rR  rP   c              3      K   | ]}d V  dS r  rP   r  s     rR   r   z:GenerationMixin.constrained_beam_search.<locals>.<genexpr>  r  rQ   r   rE   rF   r   r  r   FTr  r  r  r  r   r   r   r  r  r  r  r  r  r   c              3   H   K   | ]}|                  |         fz   V  d S r   rP   r   s     rR   r   z:GenerationMixin.constrained_beam_search.<locals>.<genexpr>#  r!  rQ   r"  r#  rC   rg   r$  r%  )>r+   r=   r  r  r  r>   r   r   r   r   r   r   r  r  r  r  r&  r  r   r   r  r  r   r   r   rL   r'  r(  r   r[  r  r\  r  r  r  r  r   r   r  r   rb  rc  r)  rf  rW   rE   rX   rY   rF   r  re  r   r+  r   r	  r  r  r,  r-  r?  rk   re   )*r   r   r  r,  rI  r?  r   r   r  r  r  r  r  r   r   r  r.  r/  rD   rW   rX   rY   rU   rV   r0  r  r  r  r   r  r
  r1  scores_for_all_vocabra  r2  r  r3  r4  r5  r6  r  rg   s*                                           @@rR   ry   z'GenerationMixin.constrained_beam_search  s   L 0@/K++QdQfQf1B1N--ThTjTj!Mo  
 !;;Lj Y Y !!Q&&Mgituuu'3'?||TE[Eh'3'?||TE[EhlC(( 	*(>L)6)BH^Hl!2!>DDZDl 	 %9$D  $J`Ju 	
 '2 $#'? 	  0;<<
+5	#,? z!_44oI
<Roo]looo  
 0LML:QnVcnE55eO44555555jn 	 %<\@Q\RRX\"9Z>OZ22VZ'> bCW b^b # 	t{'E 	Vg!q.?!@!D!D\!R!R!RmqH\f./33ODDDbf " k:y"9U^Uefff!AAAqrrE!&&
Y(>'@AA"b	.  +0,>P7YssVY*Z*Z*]*]^g^n*o*o' 7DM<MNNNN*//11S88=4=iXX<XXLd   "3%9	   G  1 !A+ 'qqq"aaax 8 " 9 9!r !: ! ! +;*:9FW*X*X' ;k!!!T'>R>\>\]n>o>o o#4#:#:#<#<  '   3022F$ H&9=9Wr355^e^p]r& {5 H(W-E,GG(' );96688%35) +04J 1 6 6z9zCY Z Z 1=C3|,,,!L-2Z!3q!l*:#;#;i#GQX\ei. . .*{ (*4::<<L%
2K 3::!$))) ; 	 	L ''9:K+,>?#$78H	9Xqqq[#9;K;U;UVX;Y;Y"Z`bcccICC$+:X D  L -.:262E2ElSdFego2p2p./& u= u$%s%s%s%s%sZ_`cdp`q`qZrZr%s%s%stt kG&. .2C2CIv2V2V ." .)-&Eb	.H 3;;%%(3% < 	
 	
 # 	1  ;6: !23{- 5.{;%56G%H!!1.!A'9*?'9%5*?
 
 
 
 3.{;%56G%H!!1.!A1"7    $K00rQ   r$  c           	        3 t          |d          sd|_        ||nt                      }||nt                      }||nt                      }||n| j        j        }||n| j        j        }||t          d          t          |t                    r|g}|,t          j        |                              |j                  nd}||n| j        j        }|	|	n| j        j        }	|
|
n| j        j        }
||n| j        j        }|r|rdnd}|r|	rdnd}|r|	rdnd}|r|
rdnd}|rJ| j        j        r>|	r|d                             d          nd}|
r|d                             d          nd}|                    |j        d	                                       d
          }|d	         j        }d|j        j                                        v s2|j        j        (d|j        j        d	                                         v rd
nd	}d}	 |rot          j        |rdnd                              |j                  }t;          j        |t:          j        j                    |!                                dk    rn
|j        d         }|}tE          t          |j                            D ]}d|v r|d         d	         |         j        d         }|j        d
         |z
  } |dd|  df         }!t          j#        |          }"|j        j        r ||!|"|d         |d                   }#n@ ||!|"|d                   }#n+|j        j        r |||d                   }#n ||          }#|#j$        |d<   tK          |          d	k    r- |||#j&        dddddf                   |#j&        dddddf<   |#j&        dddddf         '                    d          }$t          j(        ||$dddf         fd          }|v|$)                    |j        d	         d
          }%|%*                    |+                    d
                    ,                    d	          -                                 }%|%r nd}%|j        d
         |j        d
         z
  }&d|v rit          j#        |          }'|dd|& d
z
  df         }(| j        j        r | |(|'|d         |d         |	|
d          })nJ | |(|'|d         |	|
d          })n2| j        j        r | ||d         |	|
d          })n | ||	|
d          })|)j&        dd|& d
z
  df         3tK          |          d	k    rDtE          |&          D ]4}* ||ddd||*z   f         3dd|*ddf                   3dd|*ddf<   5tK          |          d	k    rDtE          |&          D ]4}* ||ddd||*z   f         3dd|*ddf                   3dd|*ddf<   5|rj3dd|& d
z
  dddf         .                    d          }+t          j/        |+d	ddddf         d
          0                    d
          dddf         },n)3dd|& d
z
  dddf         '                    d          },|dd|& df         }-|-|,ddddf         k     1                    d          d
k     2                                }.|%r|.|&k    r|.d
z  }.tg          |.||z
  d
z
            }.|,ddd|.d
z   f         }/t          j(        ||/fd          }|'|4                    |/5                                           |j        d         }0|0d
z
  }1tm          | |)j$        |1          |)_$        tm          ||d         |1d
z
            |d<   |.t          |j                  k    r|xj        dz  c_        nto          d|j        dz
            |_        |r|r|r|r.|tq          3fd tE          |.d
z             D                       z  }d|vr|0}2n|.d
z   }2|	rV| j        j        r1ts          ||)j:        ||2          }ts          ||)j;        ||2d!          }nts          ||)j<        ||2d!          }|
r;| j        j        rts          ||)j=        ||2          }nts          ||)j>        ||2          }| ?                    |)|| j        j        "          }||@                    |dddf         )                    |j        d	         d
          *                    |+                    d
                    ,                    d	                    }|7                                d	k    rd} |||          rd}|r|sn{||A                                 |r5| j        j        rt          |||||||#          S t          ||||$          S |S )%a  
        Generates sequences of token ids for models with a language modeling head using **greedy decoding** or
        **sample** (depending on `do_sample`), assisted by a smaller model. Can be used for text-decoder, text-to-text,
        speech-to-text, and vision-to-text models.

        <Tip warning={true}>

        In most cases, you do not need to call [`~generation.GenerationMixin.assisted_decoding`] directly. Use
        generate() instead. For an overview of generation strategies and code examples, check the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            assistant_model (`PreTrainedModel`, *optional*):
                An assistant model that can be used to accelerate generation. The assistant model must have the exact
                same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model
                is much faster than running generation with the model you're calling generate from. As such, the
                assistant model should be much smaller.
            do_sample (`bool`, *optional*, defaults to `False`):
                Whether or not to use sampling ; use greedy decoding otherwise.
            logits_processor (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            logits_warper (`LogitsProcessorList`, *optional*):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
                to warp the prediction score distribution of the language modeling head applied before multinomial
                sampling at each generation step.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            pad_token_id (`int`, *optional*):
                The id of the *padding* token.
            eos_token_id (`Union[int, List[int]]`, *optional*):
                The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more details.
            output_hidden_states (`bool`, *optional*, defaults to `False`):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more details.
            output_scores (`bool`, *optional*, defaults to `False`):
                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
            synced_gpus (`bool`, *optional*, defaults to `False`):
                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GreedySearchDecoderOnlyOutput`], [`~generation.GreedySearchEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GreedySearchEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.

        Examples:

        ```python
        >>> from transformers import (
        ...     AutoTokenizer,
        ...     AutoModelForCausalLM,
        ...     LogitsProcessorList,
        ...     MinLengthLogitsProcessor,
        ...     StoppingCriteriaList,
        ...     MaxLengthCriteria,
        ... )

        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
        >>> assistant_model = AutoModelForCausalLM.from_pretrained("distilgpt2")
        >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
        >>> model.generation_config.pad_token_id = model.generation_config.eos_token_id
        >>> input_prompt = "It might be possible to"
        >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
        >>> # instantiate logits processors
        >>> logits_processor = LogitsProcessorList(
        ...     [
        ...         MinLengthLogitsProcessor(10, eos_token_id=model.generation_config.eos_token_id),
        ...     ]
        ... )
        >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])
        >>> outputs = model.assisted_decoding(
        ...     input_ids,
        ...     assistant_model=assistant_model,
        ...     logits_processor=logits_processor,
        ...     stopping_criteria=stopping_criteria,
        ... )
        >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ["It might be possible to get a better understanding of the nature of the problem, but it's not"]
        ```max_assistant_tokens   Nr  rP   r   rE   rF   r   r   bloomFTr  r  r  r   assistant_past_key_valuesr  )r   r   r   r   )r  r   )r   r   r   r   )r   r   r   r   r  r  r   )r  r   r  r  r   )r   r   r  r  r   )r  r  r   r  g       @c              3   6   K   | ]}d d |d d f         V  d S r   rP   )r   r  
new_logitss     rR   r   z4GenerationMixin.assisted_decoding.<locals>.<genexpr>  s6      #V#VAJqqq!QQQw$7#V#V#V#V#V#VrQ   )is_decoder_attentionr  r  r  )Dr   rY  r+   r=   r   r   r   r   r   r   rL   r  r\  r   r  r  r  r  r   r   r   newr   fill_r?  r   rH   r   architecturesr  r  r  r  r   r  r   r   r   r  r  r   r  r   r	  r  boolr  r	  r  cumsumrd  rJ  r  r  _crop_past_key_valuesre  r  _split_model_outputsrX   rW   rE   rY   rF   r  r  r  r_   rB   )4r   r   r   r$  r,  r  rI  r   r   r  r  r  r  r  r  r   r  rD   rW   rX   rY   rU   rV   r  max_lenassistant_kv_indexingr  r  r/  candidate_input_idsr  prev_seq_lennew_token_lenassist_inputsassist_attnassistant_model_outputs	new_tokenlast_assistant_token_is_eoscandidate_length
model_attnmodel_input_idsr   r  r  selected_tokenscandidate_new_tokens	n_matchesvalid_tokensnew_cur_lennew_cache_size	added_lenr_  s4                                                      @rR   r  z!GenerationMixin.assisted_decodingT  sa   j (>?? 	534O0 0@/K++QdQfQf)6)BH[H]H]1B1N--ThTjTj'3'?||TE[Eh'3'?||TE[Eh#(<fggglC(( 	*(>LQ]Qiel<88;;I<LMMMos)6)BH^Hl!2!>DDZDl 	 %9$D  $J`Ju 	
 '2 $#'? 	  0LML$;\@Q\RRX\"9Z>OZ22VZ'> bCW b^b # 	t{'E 	Vg!q.?!@!D!D\!R!R!RmqH\f./33ODDDbf "
  )}}Y_Q-?@@FFqII $A&1 /3<BBDDDD&4@5CAFLLNNNN	 A  	 #l	  +0,>P7YssVY*Z*Z*]*]^g^n*o*o' 7DM<MNNNN*//11S88  ob)G
 #,3CDDEE 18 18.,>>#/0K#LQ#OPe#f#lmo#pL$7$=a$@<$OM$7M>??8J$KM"'/2E"F"FK&-@ 2A/.;3>,89T,U,89T,U	3 3 3// 3B/)+6,89T,U3 3 3// '-@ W2A/.A,89T,U3 3 3//
 3B/BU2V2V/ =T<c89'((1,,?O?O+-D-KAAArSTSTSTH-U@ @+2111b!!!8< 4:111b!!!8DKKPRKSS	&+i1DiPQPQPQSWPWFX0Y_a&b&b&b# '22;..ATAZ[\A]_`2a2a/4778K8U8UVW8X8XYY^^cd^eejjlll 0 3  38//28;ioa>PP !L00"_-@AA
"5aaa:J9JQ9N9P9P6P"Q;1 "d*9/9(45F(G(45F(G*;-A"&  GG #d''1(45F(G*;-A"&  GG ;1 "d*=(45F(G*;-A"&  GG #d+*;-A"&	  G !,<+<q+@+B+B(BCJ#$$q((/00 w wA*:*:;NqqqR_T[^_T_R_O_;`blmnmnmnpqstststmtbu*v*vJqqq!QQQw''=!!A%%/00 t tA*7-8KAAA}QX[\Q\}L\8]_ijkjkjkmnpqpqpqjq_r*s*sJqqq!QQQw''  ["111'7&7!&;&=&=qqq#@AIIbIQQ"'"3E!QQQ'NPQ"R"R"R"Z"Z[\"]"]^bdedede^e"f",QQQ1A0AA0E0G0G-J"K"R"RWY"R"Z"Z $7qqq;K:K:L:L7L#M 0OAAAssF4KKLTTY[T\\_``eeggI + y<L/L/LQ	Iw'81'<==I +111o	Ao+=>L	9l";DDDI#\--//000#/"-K )1_N&;D'BY[i&j&jG#8M.I!JN]^L^9 9L45 C DEEEE44;4447:3@dgj@j7k7k4  1  ' %  We#V#V#V#VyST}AUAU#V#V#VVVVF$L88 +II )AI$ {5 +?,g.FQZ, ,( .B.#6#%15. . .** .B.#.#%15. . .* ( {5 0D173PRY[d1 1-- 1E173H'S\1 1-  CC$+:X D  L
 #.';'?'?aaae$T-3A6::R+55a8899TaT[[	( ($ (++--22)-& ! F33 *%)"! + Yl	\ LLNNN" 	{- 7'!'9*?'9%5*?    5'!1"7	    rQ   )NNNr   )NN)r   FNF)FF)NF)
NNNNNNNNNN)r   r   NNNNNNNNNFNN)NNNNNNNNNFN)NNNNNNNNNNFN)
NNNNNNNNNF)NNNNNNNNNNF)FNNNNNNNNNFN)?rH   rI   rJ   rK   r   r	   rL   r   r   r   strr
   r   rM   r   r   r   r   r   r   r   r   r   staticmethodrd  r   r   r  r  r  r   r+   r  rr   r(  r   rH  r=   rN  rE  rn  rx  r  r  no_gradGenerateOutputr  r(  ContrastiveSearchOutputrs   GreedySearchOutputrt   SampleOutputru   r   BeamSearchOutputrw   BeamSampleOutputrx   rz   r   ry   r  rP   rQ   rR   r   r     s&        .
 
 
 *.&*:>	>0 >0&>0 sm>0 tC$567	>0
 
u|Xc]Del1B,CC	D>0 >0 >0 >0D *.&*:>	` `&` sm` tC$567	`
 
	` ` ` `8XX smX uS$s)^45	X
 
	X X X X& \` "\KSTW=	c3h   J '+ #+/ +/+/ +/ 3,-	+/
 !$+/ +/ +/ 
uc5<&7!88	9+/ +/ +/ +/Z
 
# 
\_ 
kn 
 
 
 
  #(04' '' ' E,-'
 
uc3h/	0' ' ' \'4 { ^b    & $)).# ## 38n# !	#
 #'# 
c3h# # # #J
 
 
$+$ 
$ $ $ $L&!1&DLM^D_&	& & & &^ 266:AEA A+A "A !+	A
 #+C+>S	+I"JA ##67A tCH~.A &el3A )1(>A 
A A A AF!1FNOcFd	   "/1EEF .0DDE 
"$88	9	   2 04!&w! w!<w! el#w! u|,	w!
 w! 
w! w! w! w!r/ / /4/4S> / / / /b, , ,\ U]__ *.8<:><@W[&*7;-16:AEW W&W $$45W ##67	W
 $$89W #+8S%,4Gc4R+S"TW d^W ""34W >*W &el3W )1(>W 
~u//	0W W W _Wr U]__  !)*:>7;<@&*8<,0/3(,26!-1%)!y y#y }y  	y
 ##67y   34y $$89y smy uS$s)^45y $D>y 'tny  ~y "*$y y >*y  TN!y$ 
&(88	9%y y y _y| ;?<@$(&*8<,0/3(,26!-1A A#A ##67A $$89	A
 SMA smA uS$s)^45A $D>A 'tnA  ~A "*$A A >*A 
!5#33	4A A A AL ;?<@7;$(&*8<,0/3(,26!-1X X#X ##67X $$89	X
   34X SMX smX uS$s)^45X $D>X 'tnX  ~X "*$X X >*X  
|U--	.!X X X X| ;?<@$(&*8<,0/3(,26!1 1#1  1 ##67	1
 $$891 SM1 sm1 uS$s)^451 $D>1 'tn1  ~1 "*$1 1 
!11	21 1 1 1J
 ;?<@7;$(&*8<,0/3(,26!H1 H1#H1  H1 ##67	H1
 $$89H1   34H1 SMH1 smH1 uS$s)^45H1 $D>H1 'tnH1  ~H1 "*$H1 H1  
!11	2!H1 H1 H1 H1\
 ;?<@$(&*8<,0/3(,26!y1 y1#y1  y1 ##67	y1
 $$89y1 SMy1 smy1 uS$s)^45y1 $D>y1 'tny1  ~y1 "*$y1 y1 y1 y1 y1~ ;?<@$(&*8<,0/3(,26&*K1 K1#K1 "=K1 ##67	K1
 $$89K1 SMK1 smK1 uS$s)^45K1 $D>K1 'tnK1  ~K1 "*$K1 d^K1 
!11	2K1 K1 K1 K1b
  :>7;<@&*8<,0/3(,26!-1q q#q +q 	q
 ##67q   34q $$89q smq uS$s)^45q $D>q 'tnq  ~q "*$q q >*q q q q q qrQ   r   c           
         g }| j         j        rt          t          |                    D ]o}|                    ||         d         ddddd|ddf         ||         d         ddddd|ddf         ||         d         ||         d         f           pt          |          }n%d| j        j                                        v s2| j         j	        d| j         j	        d                                         v r}t          t          |                    D ]O}|                    ||         d         ddddd|f         ||         d         ddd|ddf         f           Pt          |          }nVd| j        j                                        v s2| j         j	        d| j         j	        d                                         v r| j         j
        r:t          t          |                    D ]}||         ddd|ddf         ||<   nt          t          |                    D ]}||         ddddd|ddf         ||<    nt          t          |                    D ]U}|                    ||         d         ddddd|ddf         ||         d         ddddd|ddf         f           Vt          |          }|S )z9Crops the past key values up to a certain maximum length.r   Nr   r      r[  
gptbigcode)r   r   r  r   r  r  r   rH   r   rc  multi_query)modelr   maximum_lengthnew_pastrB  s        rR   rf  rf    sb   H|& )*_--.. 	 	COO#C(+AAAqqq/>/111,DE#C(+AAAqqq/>/111,DE#C(+#C(+	     //	EO,2244	4	4".7el>XYZ>[>a>a>c>c3c3c_--.. 	 	COO#C(+AAAqqq/>/,AB#C(+AAA,AB     //	17799	9	9".<5<C]^_C`CfCfChCh3h3h<# 	VS1122 S S'6s';AAAPQPQPQ<Q'R$$S S1122 V V'6s';AAAqqq/>/STSTST<T'U$$V _--.. 	 	COO#C(+AAAqqq/>/111,DE#C(+AAAqqq/>/111,DE     //rQ   Fc                 4   t          |           dk    r;d}|D ]&}|r|n|j        d         }||dd|d|f         fz  }'| |fz  } |dz  }||z  }t          |          D ]9}d}|D ],}|r||z   n|j        d         }||d||dz   d|f         fz  }-| |fz  } :| S )z
    Given the (decoder/cross attentions)/(decoder hidden states) for multiple generated tokens, splits it into a tuple
    where each member corresponds to a single generated token.
    r   rP   r   .Nr   )r   r   r  )	r   new_outputsr/  r{  r`  	new_tupler  last_dim_sizer  s	            rR   rg  rg  8  s    7||q	  	A 	AE';PGGRM%XgX~~ =>@@III<1W	9    	  	B 	BE+?TGaKKU[QS_M%QQY >?AAIII<NrQ   r  Infr  r  r  filter_valuer  r   c                     |dk    r t          |||          d|           } d|cxk    rdk    rn n t          |||          d|           } | S )a
  
    Filter a distribution of logits using top-k and/or nucleus (top-p) filtering

    Args:
        logits: logits distribution shape (batch size, vocabulary size)
        top_k (`int`, *optional*, defaults to 0):
            If > 0, only keep the top k tokens with highest probability (top-k filtering)
        top_p (`float`, *optional*, defaults to 1.0):
            If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus
            filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
        min_tokens_to_keep (`int`, *optional*, defaults to 1):
            Minimumber of tokens we keep per batch example in the output.

    From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    r   )r  r  r  Nr  )r  r  r  )r6   r7   )r  r  r  r  r  s        rR   top_k_top_p_filteringr  R  s    , qyyp!L]oppp&
 
 	ESp!L]oppp&
 
 MrQ   r  r  next_top_k_probsalpha
beam_widthc                    | |                      dd          z  }||                     dd          z  }t          j        ||                    dd                                        d          }t          j        |d          \  }}	|                    d          }d|z
  |z  ||z  z
  }
t          j        t          j        |
|                    }
|
                    d          \  }	}|S )a  
    Reranks the top_k candidates based on a degeneration penalty (cosine similarity with previous tokens), as described
    in the paper "A Contrastive Framework for Neural Text Generation". Returns the index of the best candidate for each
    row in the batch.
    r   T)r   keepdimr   r   r   r  )	normrL   matmulr`  r  re  r[  r^  r  )r  r  r  r  r  norm_context_hiddennorm_next_hiddencosine_matrixdegeneration_penaltyr  contrastive_scorer  s               rR   r  r  u  s     )>+>+>1d+>+S+SS"[%5%5!T%5%J%JJL!46F6P6PQRTU6V6VWW__`bccM#i2>>>!',,R00u(885CW;WWEK0A:$N$NOO'+++33OA|rQ   r|  )lr  r   r  dataclassesr   typingr   r   r   r   r   r	   r
   r   rL   torch.distributeddistributedr  r   	deepspeedr   modeling_outputsr   r   models.autor   r   r   r   r   utilsr   r   r   beam_constraintsr   r   rw   r   r   r   configuration_utilsr   logits_processr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   rI  r:   r;   r<   r=   r>   modeling_utilsr?   	streamersr@   
get_loggerrH   r  rB   rT   r\   r_   ra   rc   re   rk   rn   rp   r  r  r  r  r  r  rr   r   rf  rg  r(  rO   r   r  r  rP   rQ   rR   <module>r     sT  "    ! ! ! ! ! ! S S S S S S S S S S S S S S S S S S S S                    2 2 2 2 2 2 F F F F F F F F              7 6 6 6 6 6 6 6 6 6 F F F F F F F F R R R R R R R R R R 1 1 1 1 1 1                                                       8               (000000''''''		H	%	% D D D D DK D D D6 #L #L #L #L #LK #L #L #LL D D D D D D D D8 &L &L &L &L &L{ &L &L &LR D D D D Dk D D D8 'L 'L 'L 'L 'L 'L 'L 'LT D D D D D+ D D DD .L .L .L .L .L[ .L .L .Lb D D D D D+ D D DD -L -L -L -L -L[ -L -L -L` ;=ZZ[ /1HHI79TTU 79TTU  EGi ij )<9IK[]ttu, , , , ,\ , , ,"]@ ]@ ]@ ]@ ]@ ]@ ]@ ]@@A- - -`   8  5<<-        	 
          F%" ' 	
       rQ   