# установка нужных библиотек
# !pip install datasets
# !pip install vllm

import torch
import transformers

from datasets import Dataset, load_dataset
from enum import Enum


from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor

from pprint import pprint
from pydantic import BaseModel

print(transformers.__version__)
print(torch.__version__)

4.50.0
2.6.0+cu124

# Use the Chinese Qwen 2.5 model.
# Details: https://github.com/QwenLM/Qwen2.5
model_name = "Qwen/Qwen2.5-3B"

# Load the tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model; dtype and device placement are left to the library ("auto")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
)

/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: 
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(

config.json:   0%|          | 0.00/683 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

# Input text (the prompt)
inputs = "План изучения машинного обучения:\n"
# Run the tokenizer and request PyTorch tensors
tokenized_inputs = tokenizer(inputs, return_tensors="pt")

# Show the raw prompt followed by its tokenized form
print("Вход модели:", inputs, "Токенизированный вход:", sep="\n")
pprint(tokenized_inputs)

Вход модели:
План изучения машинного обучения:

Токенизированный вход:
{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[ 16854, 131972,  23064,  28371,  18492, 130839,  38800, 143181,    510]])}

# Use the device the model lives on
device = model.device

# Move the input tensors to that device
input_ = {key: value.to(device) for key, value in tokenized_inputs.items()}

# Run the model. This is inference only, so gradient tracking is switched off;
# otherwise the forward pass records an autograd graph that is never used.
with torch.no_grad():
    outputs = model(**input_)

logits = outputs.logits  # tensor with the raw predictions (logits)
probabilities = torch.softmax(logits, dim=-1)  # token probabilities
next_token_id = torch.argmax(  # take the token with the highest probability
    probabilities[:, -1, :], dim=-1, keepdim=True
)
next_token_id

tensor([[16]], device='cuda:0')

# Number of tokens to generate
num_tokens_to_generate = 128

# Look the end-of-sequence id up once instead of on every iteration
eos_token_id = tokenizer.eos_token_id

# Inference only: disable gradient tracking so the repeated forward passes do
# not record autograd graphs.
with torch.no_grad():
    for step in range(num_tokens_to_generate):
        # Forward pass over the whole sequence produced so far
        outputs = model(**input_)

        # Logits at the last position, i.e. the prediction for the next token
        next_token_logits = outputs.logits[:, -1, :]  # [batch_size, vocab_size]

        # Greedy choice: the token with the highest score
        next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)  # [batch_size, 1]

        # Append the new token to the model input
        input_["input_ids"] = torch.cat([input_["input_ids"], next_token], dim=-1)

        # Extend the attention mask by one column of ones. new_ones() keeps the
        # mask's dtype and device, and the row count follows the batch instead
        # of being hard-coded to 1.
        new_mask_column = input_["attention_mask"].new_ones((next_token.shape[0], 1))
        input_["attention_mask"] = torch.cat(
            [input_["attention_mask"], new_mask_column], dim=-1
        )

        # Stop at end of text. The comparison is reduced explicitly so it does
        # not depend on implicit tensor truthiness; with several rows the loop
        # stops only when every row emits EOS at the same step.
        if eos_token_id is not None and bool((next_token == eos_token_id).all()):
            break

# Decode the result
generated_text = tokenizer.decode(input_["input_ids"][0], skip_special_tokens=True)
print(f"Сгенерированный текст:\n\n{generated_text}")

Сгенерированный текст:

План изучения машинного обучения:
1. Основы машинного обучения
2. Структуры данных и алгоритмы машинного обучения
3. Статистика и математика для машинного обучения
4. Структуры данных и алгоритмы машинного обучения
5. Статистика и математика для машинного обучения
6. Структуры данных и алгоритмы машинного обучения
7. Статистика и математика для машинного обучения
8. Структуры данных и алгоритмы машинного обучения

# Greedy decoding through the built-in generate() API
greedy_kwargs = {
    "max_new_tokens": num_tokens_to_generate,  # upper bound on newly generated tokens
    "do_sample": False,  # no randomness (greedy choice)
    "pad_token_id": tokenizer.eos_token_id,  # reuse the EOS token as the padding token
}
generated_ids = model.generate(**tokenized_inputs.to(model.device), **greedy_kwargs)

# Decode back to text, keeping special tokens visible
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
print(f"Сгенерированный текст:\n\n{generated_text}")

Сгенерированный текст:

План изучения машинного обучения:
1. Основы машинного обучения
2. Структуры данных и алгоритмы машинного обучения
3. Статистика и математика для машинного обучения
4. Структуры данных и алгоритмы машинного обучения
5. Статистика и математика для машинного обучения
6. Структуры данных и алгоритмы машинного обучения
7. Статистика и математика для машинного обучения
8. Структуры данных и алгоритмы машинного обучения

# Sampling-based decoding through generate()
sampling_kwargs = {
    "max_new_tokens": num_tokens_to_generate,  # upper bound on newly generated tokens
    "do_sample": True,  # sample instead of taking the argmax
    "temperature": 1.05,  # slightly raised temperature for more varied answers
    "pad_token_id": tokenizer.eos_token_id,  # avoids a warning about the missing pad token
}
generated_ids = model.generate(**tokenized_inputs.to(model.device), **sampling_kwargs)

# Decode back to text, keeping special tokens visible
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
print(f"Сгенерированный текст:\n\n{generated_text}")

Сгенерированный текст:

План изучения машинного обучения:
Классификация: включает разбиение данных данных набор, где по некоторому вкладывает гипотезу
Регрессия: прогнозирование вещества, например, на основе данных набор и прогнозирования, которые будут иметь место в будущем
Анализ сообщений: подразумевает прогнозирование или извлечение данных из текстовых данных<|endoftext|>

# Note: from here on the Instruct version of the model is used!
model_name = "Qwen/Qwen2.5-3B-Instruct"

# Replace the base tokenizer and model with the Instruct checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
)

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

# The user's question
prompt = "Что такое relu в глубоком обучении?"

# System instruction placed at the start of the conversation
system_prompt = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."

# Chat history: the system instruction followed by the user's question
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": prompt},
]

tokenizer.chat_template

'{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- messages[0][\'content\'] }}\n    {%- else %}\n        {{- \'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\' }}\n    {%- endif %}\n    {{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- \'<|im_start|>system\\n\' + messages[0][\'content\'] + \'<|im_end|>\\n\' }}\n    {%- else %}\n        {{- \'<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n\' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}\n        {{- \'<|im_start|>\' + message.role + \'\\n\' + message.content + \'<|im_end|>\' + \'\\n\' }}\n    {%- elif message.role == "assistant" %}\n        {{- \'<|im_start|>\' + message.role }}\n        {%- if message.content %}\n            {{- \'\\n\' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- \'\\n<tool_call>\\n{"name": "\' }}\n            {{- tool_call.name }}\n            {{- \'", "arguments": \' }}\n            {{- tool_call.arguments | tojson }}\n            {{- \'}\\n</tool_call>\' }}\n        {%- endfor %}\n        {{- \'<|im_end|>\\n\' }}\n    {%- elif message.role == "tool" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}\n            {{- \'<|im_start|>user\' }}\n        {%- endif %}\n        {{- \'\\n<tool_response>\\n\' }}\n        {{- message.content }}\n        {{- \'\\n</tool_response>\' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}\n            {{- \'<|im_end|>\\n\' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- \'<|im_start|>assistant\\n\' }}\n{%- endif %}\n'

# Render the chat history into one prompt string using the chat template
template_kwargs = {
    "tokenize": False,  # only format the text, do not tokenize it
    "add_generation_prompt": True,  # append the opening of the assistant turn
}
text = tokenizer.apply_chat_template(messages, **template_kwargs)
text

'<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nЧто такое relu в глубоком обучении?<|im_end|>\n<|im_start|>assistant\n'

model_inputs = tokenizer(text, return_tensors="pt").to(model.device)
model_inputs

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,  72819,  24634, 134322,
          92874,   5805, 132853,  63469,  14746,  12228, 143180,  83098,     30,
         151645,    198, 151644,  77091,    198]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

generated_ids = model.generate(**model_inputs, do_sample=True, max_new_tokens=1024)

generated_ids

tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,  72819,  24634, 134322,
          92874,   5805, 132853,  63469,  14746,  12228, 143180,  83098,     30,
         151645,    198, 151644,  77091,    198,  79652,    320,   4415,   1870,
          28263,   7954,      8,   1959,  67879, 136109,  23064, 135041, 133051,
          43686,  54713,  12141, 130304,  53586,   5805,  44816,   7599,  24634,
           6020,  43686,   6709,  21032, 129568,  43686, 137528, 129250,   7587,
         130599,  63833, 126202,   6709,  21032, 129568,  43686, 137528,  21032,
             13, 134948, 126068,  42965, 137677, 130193,   1032,  23236,   1447,
             16,     13,  34348,   2247,  46705,  43758, 142567,   4235,    510,
            256,   1032,  23236, 141107, 133104,  25460,  18943,  21032,  76395,
          54713,  11916,     11, 130407, 133026,  26909, 137514,   7587,   5805,
          73615,  72285,  91357,  70021,  61391,     11,  46253,  98745, 142189,
          54908,  33513,     11,   7587,    220,     15,     11,  46253,  98745,
          20264,  30103,  10373,  59890,  33513,    382,     17,     13,  66791,
          31230,   3780,  30343,    510,    256,  17767,    282,   2075,      8,
            284,   1124,   2810,      7,     15,     11,    856,      8,   1124,
            692,     18,     13, 134004,  16104,  85530,   1478,    510,    256,
            481,  79484,  24276,  13039,   1478,  20811, 135958,   3038,  61391,
             25,  37660,  50645,  43767, 143180,  17175,   7587,  56825,   2247,
          51574,  26619,    624,    256,    481,  44358,  51329, 132775,  63262,
          60290,   1792,  60279, 130297,   4793,  46705,  18492,   5409, 126964,
          16339,    330,  88454, 125153,  22496,   4235,      1,    320,  15649,
          10976,  20169,    701, 131222,  36305, 132853,  63469,  14746,  66645,
          34775,  60290,  10474,    624,    256,    481,  27499,  22621, 138121,
          99047,  67642,  52276,     11,  47389,  66988,  13932, 135334,  17787,
         143180,  17175,    382,     19,     13,  34789,  13103,  24276,   7972,
          16748,    510,    256,    481, 143685,  12281,  28156,  43055,   4793,
          17998,    330,   8959,  37622,   4552,   1792,  50636,  87512,      1,
            320,     89,  23342,  92495,   1648,  87512,     11, 128548, 130356,
         130414,    220,     15,     11, 129348,  18658,  54423, 137673,   8215,
          56825,   4552,   6949,  42193,  66645, 137232, 126202, 143181,    624,
            256,    481,  34789,   8178,  14497,   2184,  14005,   6020, 124448,
          24725, 125879,   1802,  18673,   4552,     25, 126231,  91778, 131705,
          38800, 140974,  18492,  88663,  16748,     11,  46253,  65359, 132399,
           1478, 140662,  20264,  30103,  10373,  59890, 125439,     11,  24725,
         125879,   1802,  18673,  66988,  61631, 128632, 143936,     11,  47389,
          66988,  12281,  26991,  88238,   7665,    330, 124818,  14949,  22787,
           3780,      1, 131480,   6715,    382,     20,     13,  42796,  32693,
          32195,  26619,    510,    256,   1032,  23236, 135927, 143248,  51670,
          20396,  52528,   7599, 127682, 124237, 141204, 126242,  47162,  38888,
          45077,  10813,   7336, 130534,   2247,     12, 133486,   1802,   5805,
          44816,   7599,  24634,   6020,  43686,   6709,  21032, 129568,  43686,
         137528, 129250,   7587, 128488,  43116,   9923,  68675,  70729,  10474,
           5409, 142746,  63833,  49707,  54713,  12141, 130304,  53586,    382,
             21,     13,  79698,  46705,  61676,    510,    256,  22933, 128714,
         131662, 129612,  70139,  43000,  10474, 131704, 139193,  20562, 131939,
          53586,   1032,  23236,     11, 132842,  51670,   1967,  28100,   1032,
          23236,     11,   6991,  16340,   1032,  23236,   7587,  17258,     52,
             11, 128548, 126091, 129371, 139437,  23064, 127782,  24276,   7972,
          89777, 140982,  28018,  38800,   1032,  23236,    382,  79652, 139427,
         137255,  38888, 131964,  11916, 137908,  14062,   1802, 132348,  13039,
          14949,  19849, 141132,   7587, 143181,   6709,  21032, 129568,  43686,
         137528,  21032, 138425, 131189, 132717,  67642,   7587, 129020,  13039,
           1504,     13, 151645]], device='cuda:0')

# Cut the prompt tokens off every generated sequence so that only the newly
# produced continuation remains
generated_ids = [
    full_ids[prompt_ids.shape[0] :]
    for prompt_ids, full_ids in zip(model_inputs["input_ids"], generated_ids)
]

# batch_decode handles a whole batch; here the batch contains a single item
decoded_batch = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
response = decoded_batch[0]
response

'ReLU (Rectified Linear Unit) — это одна из самых популярных функций активации в сверточных нейронных сетях и других типах нейронных сетей. Вот основные характеристики ReLU:\n\n1. Основная идея:\n   ReLU представляет собой линейную функцию, которая принимает вход и возвращает его без изменений, если он положительный, и 0, если он отрицательный.\n\n2. Формула:\n   \\( f(x) = \\max(0, x) \\)\n\n3. Преимущества:\n   - Простота вычислений: быстрое обучение и предсказание.\n   - Меньшая вероятность столкновения с проблемой "размытия" (vanishing gradient), особенно при глубоких слоях.\n   - Снижение размерности данных, что может ускорить обучение.\n\n4. Недостатки:\n   - Может пропускать "забытые значения" (zombie activations): значения, которые всегда будут 0, могут нести информацию о предыдущих этапах обучения.\n   - Нетограниченно градиенты: во время обратного распространения ошибки, если значение выхода становится отрицательным, градиент может быть очень большим, что может привести к "перекосу" весов.\n\n5. Использование:\n   ReLU обычно используется как альтернатива сигмоидной или тангенс-функции в сверточных нейронных сетях и их комбинациях с другими типами функций активации.\n\n6. Обновления:\n   В более современных моделях часто используются вариации ReLU, такие как Leaky ReLU, Parametric ReLU и ELU, которые решают некоторые из недостатков оригинального ReLU.\n\nReLU стало стандартной частью многих библиотек для создания и обучения нейронных сетей благодаря своей эффективности и простоте.'

# Load the "train" split of the scikit-learn/imdb dataset and divide it 80/20
# into train/test parts; the fixed seed makes the split reproducible.
data = load_dataset("scikit-learn/imdb", split="train").train_test_split(test_size=0.2, seed=42)
data

/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: 
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 10000
    })
})

# Take one fixed training example (index 1137) to experiment with
random_sample = data["train"][1137]
random_sample

{'review': 'and it doesn\'t help rohmer\'s case that a few years later Syberberg came along and made a staggeringly great piece of work on the same subject (with a little help from Wagner).<br /><br />maybe this movie didn\'t look so paltry when it came out, without the syberberg film to compare it to, which was probably shot on an even smaller sound stage with fewer resources. I actually can\'t recall at the moment whether there are horses in the syberberg film. all I know is, the German version is pure magic, while this one looks like some college production documented on film for archival purposes.<br /><br />the music... la musique... isn\'t even credited here on IMDb... but someone based it on \'airs from the 12th-14th centuries" or something... well it isn\'t a great help to the film. it comes off as inauthentic and cheesy, comme le frommage mon cher!!!<br /><br />rohmer is one of those french auteurs who likes his leading men generally quite unattractive, too, and that doesn\'t help matters. syberberg\'s Parsifal was adorable, and can be seen on German television today selling some kind of special bicycle he invented. .. .<br /><br />I shudder to think what watching the syberberg on video is like. I remember that the last time I saw the film in a theater, the print was so bad that the experience was a whopping 5 hour travesty. But even then it would have to surpass what this version has to offer, I\'m afraid.<br /><br />points for earnestness, for chutzpah, but... this film simply needed beau-coup more bucks. it doesn\'t look like a medieval manuscript it looks CHEAPO! BON MARCHE!! oh and yeah, it just ends very arbitrarily with Parsifal going to church and this cheesy passion play being interjected... blah!',
 'sentiment': 'negative'}

messages = [{"role": "user", "content": f"Classify this sentiment: {random_sample['review']}"}]

def tokenize_and_apply_chat_template(messages, tokenizer):
    """Apply the tokenizer's chat template to ``messages`` and tokenize the result.

    Args:
        messages: Chat history, forwarded unchanged to
            ``tokenizer.apply_chat_template``.
        tokenizer: Object that provides ``apply_chat_template``.

    Returns:
        The value returned by ``tokenizer.apply_chat_template`` when called
        with tokenization enabled, the generation prompt appended, PyTorch
        tensors requested and dictionary output requested.
    """
    template_options = {
        "tokenize": True,
        "add_generation_prompt": True,
        "return_tensors": "pt",
        "return_dict": True,
    }
    encoded = tokenizer.apply_chat_template(messages, **template_options)
    return encoded


tokenized_messages = tokenize_and_apply_chat_template(messages, tokenizer).to(model.device)

tokenized_messages

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,   1207,  16948,     11,   3465,
            553,  54364,  14817,     13,   1446,    525,    264,  10950,  17847,
             13, 151645,    198, 151644,    872,    198,   1957,   1437,    419,
          25975,     25,    323,    432,   3171,    944,   1492,    926,     71,
           1174,    594,   1142,    429,    264,   2421,   1635,   2937,   5718,
            652,   7725,   3697,   3156,    323,   1865,    264,  39156,  11307,
           2244,   6573,    315,    975,    389,    279,   1852,   3832,    320,
           4197,    264,   2632,   1492,    504,  51375,    568,     27,   1323,
          23976,   1323,   6206,  36760,    419,   5700,   3207,    944,   1401,
            773,  10854,   1539,    979,    432,   3697,    700,     11,   2041,
            279,   6568,    652,   7725,   4531,    311,   9429,    432,    311,
             11,    892,    572,   4658,   6552,    389,    458,   1496,   9155,
           5112,   6430,    448,  16745,   4963,     13,    358,   3520,    646,
            944,  19091,    518,    279,   4445,   3425,   1052,    525,  26830,
            304,    279,   6568,    652,   7725,   4531,     13,    678,    358,
           1414,    374,     11,    279,   5938,   2319,    374,  10526,  10963,
             11,   1393,    419,    825,   5868,   1075,   1045,   7770,   5670,
          26372,    389,   4531,    369,  93947,   9895,  15757,   1323,  23976,
           1323,   6206,   1782,   4627,   1112,   1187,  88569,   1112,   4436,
            944,   1496,  40757,   1588,    389,  86401,   1112,    714,   4325,
           3118,    432,    389,    364,   4720,    504,    279,    220,     16,
             17,    339,     12,     16,     19,    339,  23631,      1,    476,
           2494,   1112,   1632,    432,   4436,    944,    264,   2244,   1492,
            311,    279,   4531,     13,    432,   4041,   1007,    438,    304,
           3242,   4256,    323,  86747,     11,  21572,    512,    504,  52310,
           1615,  22216,  12069,     27,   1323,  23976,   1323,   6206,    299,
             71,   1174,    374,    825,    315,   1846,  41193,  88037,   1723,
            879,  13151,    806,   6388,   2953,   8789,   5008,    650,    266,
          69078,     11,   2238,     11,    323,    429,   3171,    944,   1492,
          12850,     13,   6568,    652,   7725,    594,  52711,    333,    278,
            572,  40608,     11,    323,    646,    387,   3884,    389,   5938,
          12425,   3351,  11236,   1045,   3093,    315,   3281,  34986,    566,
          35492,     13,   5241,    659,     27,   1323,  23976,   1323,   6206,
             40,    557,  56129,    311,   1744,   1128,  10099,    279,   6568,
            652,   7725,    389,   2766,    374,   1075,     13,    358,   6099,
            429,    279,   1537,    882,    358,   5485,    279,   4531,    304,
            264,  26705,     11,    279,   1173,    572,    773,   3873,    429,
            279,   3139,    572,    264,  62869,    220,     20,   6460,  10137,
          40239,     13,   1988,   1496,   1221,    432,   1035,    614,    311,
          52020,   1128,    419,   2319,    702,    311,   3010,     11,    358,
           2776,  16575,  15757,   1323,  23976,   1323,   6206,   7706,    369,
          54249,   2090,     11,    369,    521,  32997,     79,   1466,     11,
            714,   1112,    419,   4531,   4936,   4362,  70906,   1786,  13346,
            803,  47334,     13,    432,   3171,    944,   1401,   1075,    264,
          41008,  46813,    432,   5868,  49521,  84240,      0,    425,    711,
          37499,  43793,   2928,  14019,    323,  21639,     11,    432,   1101,
          10335,   1602,  85657,    448,  52711,    333,    278,   2087,    311,
           8817,    323,    419,  86747,  11677,   1486,   1660,    946,  28303,
           1112,  52561,      0, 151645,    198, 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

generation_options = {
    "do_sample": False,  # greedy generation for deterministic output
    "max_new_tokens": 128,  # maximum number of newly generated tokens
}
generated_ids = model.generate(**tokenized_messages, **generation_options)

# Length of the prompt in tokens; everything after it is the model's answer
input_ids_len = tokenized_messages["input_ids"].shape[-1]
tokenizer.decode(generated_ids[0][input_ids_len:])

'The sentiment expressed in this text is predominantly negative towards Rohmer\'s film. The reviewer criticizes several aspects of the movie:\n\n1. **Comparison to Syberberg\'s Work**: The reviewer suggests that Rohmer\'s film looks dated and inferior when compared to Syberberg\'s "staggeringly great" work, implying that Rohmer\'s film lacks the quality and depth.\n\n2. **Production Quality**: The reviewer describes the film as looking "paltry," "like some college production," and "CHEAPO! BON MARCHE!!" indicating poor production values.\n\n3. **Music**: The music is criticized as inauthentic and'

# Extend the prompt with an explicit request for a positive / negative answer
user_prompt = f"Classify this sentiment: {random_sample['review']}. Return 'positive' or 'negative':"
messages = [{"role": "user", "content": user_prompt}]

# Same pipeline as before: template + tokenize, generate greedily, then decode
# only the tokens that follow the prompt
tokenized_messages = tokenize_and_apply_chat_template(messages, tokenizer).to(model.device)
generated_ids = model.generate(
    **tokenized_messages,
    max_new_tokens=128,
    do_sample=False,
)

input_ids_len = tokenized_messages["input_ids"].shape[-1]
tokenizer.decode(generated_ids[0][input_ids_len:])

'Negative<|im_end|>'

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# vLLM can load models directly from the Hugging Face Hub
# NOTE(review): this rebinds `model`, which earlier cells use for the
# transformers model; presumably the previous model should be released first
# when both have to fit on the same GPU — confirm.
model = LLM(model="Qwen/Qwen2.5-3B-Instruct", dtype="float16")

INFO 03-28 17:34:25 [__init__.py:239] Automatically detected platform cuda.
WARNING 03-28 17:34:26 [config.py:2614] Casting torch.bfloat16 to torch.float16.
INFO 03-28 17:34:41 [config.py:585] This model supports multiple tasks: {'score', 'embed', 'classify', 'generate', 'reward'}. Defaulting to 'generate'.
WARNING 03-28 17:34:41 [arg_utils.py:1854] Compute Capability < 8.0 is not supported by the V1 Engine. Falling back to V0. 
INFO 03-28 17:34:41 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.2) with config: model='Qwen/Qwen2.5-3B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=Qwen/Qwen2.5-3B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=False, 
INFO 03-28 17:34:43 [cuda.py:239] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 03-28 17:34:43 [cuda.py:288] Using XFormers backend.
INFO 03-28 17:34:44 [parallel_state.py:954] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 03-28 17:34:44 [model_runner.py:1110] Starting to load model Qwen/Qwen2.5-3B-Instruct...
INFO 03-28 17:34:44 [weight_utils.py:265] Using model weights format ['*.safetensors']

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]

INFO 03-28 17:35:05 [loader.py:447] Loading weights took 20.53 seconds
INFO 03-28 17:35:06 [model_runner.py:1146] Model loading took 5.7916 GB and 21.629146 seconds
INFO 03-28 17:35:15 [worker.py:267] Memory profiling takes 8.99 seconds
INFO 03-28 17:35:15 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.90) = 13.27GiB
INFO 03-28 17:35:15 [worker.py:267] model weights take 5.79GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 2.52GiB; the rest of the memory reserved for KV Cache is 4.90GiB.
INFO 03-28 17:35:16 [executor_base.py:111] # cuda blocks: 8928, # CPU blocks: 7281
INFO 03-28 17:35:16 [executor_base.py:116] Maximum concurrency for 32768 tokens per request: 4.36x
INFO 03-28 17:35:20 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:34<00:00,  1.00it/s]

INFO 03-28 17:35:55 [model_runner.py:1570] Graph capturing finished in 35 secs, took 0.21 GiB
INFO 03-28 17:35:55 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 49.43 seconds

# Restrict the generated answer to one of the two labels
allowed_labels = ["positive", "negative"]
guided_decoding_params = GuidedDecodingParams(choice=allowed_labels)
# Sampling parameters are configured separately
sampling_params = SamplingParams(
    temperature=0.0,  # zero temperature is equivalent to greedy decoding
    guided_decoding=guided_decoding_params,
)

user_prompt = f"Classify this sentiment: {random_sample['review']}"
messages = [{"role": "user", "content": user_prompt}]
# Note: the chat() method is used here, not generate()!
outputs = model.chat(sampling_params=sampling_params, messages=messages)

INFO 03-28 17:37:12 [chat_utils.py:379] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]/usr/local/lib/python3.11/dist-packages/torch/utils/cpp_extension.py:2059: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].
  warnings.warn(
Processed prompts: 100%|██████████| 1/1 [01:35<00:00, 95.27s/it, est. speed input: 4.62 toks/s, output: 0.02 toks/s]

print(outputs[0].outputs[0].text)

negative

# Define the structure of the response with Pydantic
# Closed set of sentiment labels; mixing in `str` makes every member a string
# equal to its value.
class SentimentType(str, Enum):
    positive = "positive"
    negative = "negative"


# Response model with a single field, `sentiment`
class ReviewDescription(BaseModel):
    sentiment: SentimentType  # field restricted to the fixed set of options


# Automatically generate the JSON schema used for validation:
json_schema = ReviewDescription.model_json_schema()
json_schema

{'$defs': {'SentimentType': {'enum': ['positive', 'negative'],
   'title': 'SentimentType',
   'type': 'string'}},
 'properties': {'sentiment': {'$ref': '#/$defs/SentimentType'}},
 'required': ['sentiment'],
 'title': 'ReviewDescription',
 'type': 'object'}

# Constrain the output to the JSON schema and decode greedily (temperature 0)
guided_decoding_params = GuidedDecodingParams(json=json_schema)
sampling_params = SamplingParams(
    temperature=0.0,
    guided_decoding=guided_decoding_params,
)

outputs = model.chat(sampling_params=sampling_params, messages=messages)

WARNING 03-28 17:41:37 [__init__.py:33] xgrammar does not support advanced JSON schema features like enums, patterns or numeric ranges. Falling back to use outlines instead.

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.64it/s, est. speed input: 726.91 toks/s, output: 19.82 toks/s]

print(outputs[0].outputs[0].text)

{ "sentiment": "negative" }

Введение в анализ данных ¶

1. Введение в `transformers`¶

2. Введение в `vLLM`¶

Введение в анализ данных¶

1. Введение в transformers¶

2. Введение в vLLM¶

Введение в анализ данных ¶

1. Введение в `transformers`¶

2. Введение в `vLLM`¶