The first message that I send to the LLM is handled just fine, but after that it starts spouting massive nonsense. What's wrong with my code?
import os
import logging
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoProcessor, LlavaForConditionalGeneration, TextIteratorStreamer, BitsAndBytesConfig
import torch
from collections import deque
import asyncio
# FastAPI app setup: the application object created at import time
app = FastAPI()
# Define the incoming request schema
class SpeechInput(BaseModel):
    """Schema of an incoming speech request: who spoke and what they said."""

    # Identifier of the speaker, as a string
    speaker: str
    # The spoken text, as a string
    text: str
# Setup logging: DEBUG level on the root logger, "timestamp [LEVEL] message" format
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)s] %(message)s")
# Constants
# Model repository ID; can be overridden through the LLAVA_MODEL_ID environment variable
MODEL_ID = os.getenv("LLAVA_MODEL_ID", "CreitinGameplays/pixtral-1")
# Quantization settings passed to from_pretrained: load the weights in 8-bit
QUANT_CONFIG = BitsAndBytesConfig(load_in_8bit=True)
# Maximum number of messages kept in the chat history buffer
HISTORY_LEN = 20
# Initialize model and processor
# Runs at import time; a failure is logged with its traceback and re-raised,
# so the module does not finish importing without a usable model.
logging.debug("Loading model and processor...")
try:
    model = LlavaForConditionalGeneration.from_pretrained(
        MODEL_ID,
        quantization_config=QUANT_CONFIG,
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    # Copy the vision settings from the model config onto the processor so
    # both objects agree on them.
    processor.patch_size = model.config.vision_config.patch_size
    processor.vision_feature_select_strategy = model.config.vision_feature_select_strategy
    logging.debug("Model and processor loaded successfully.")
except Exception:
    logging.exception("Failed to load model and processor.")
    # Bare raise re-raises the active exception unchanged.
    raise
# History buffer for keeping track of previous messages.
# Entries are dicts with "role" and "content" keys; once HISTORY_LEN entries
# are stored, each append discards the oldest entry.
chat_history = deque(maxlen=HISTORY_LEN)
async def generate_response_from_llm(text: str):
    """Generate a reply to ``text`` and record the exchange in ``chat_history``.

    The prompt is assembled in chronological order: system prompt, then the
    stored history, then the new user message as the final turn. After a
    successful generation both the user message and the reply are stored, so
    the history keeps alternating user/assistant turns.

    Args:
        text: The new user message.

    Returns:
        The generated reply, or a fixed apology string if an exception is
        raised while starting generation or reading the streamed output.
    """
    # Local import: the module's import block does not provide threading.
    import threading

    logging.debug(f"Generating response for: {text}")
    # Define the system prompt, this can be modified based on your desired behavior
    system_prompt = "You are Pixel, a cute little AI girl. You livestream sometimes and change moods occasionally. Your responses are rather short and never longer than two sentences."

    user_message = {"role": "user", "content": text}

    # Snapshot the history and drop leading turns until the first user turn.
    # If the bounded deque ever trims half of a user/assistant pair, this
    # keeps the conversation from opening on an orphaned assistant reply.
    history = list(chat_history)
    while history and history[0]["role"] != "user":
        history.pop(0)

    # Order matters: the new user message must be the LAST turn. Placing the
    # history after it makes the prompt end on an old assistant reply, which
    # the model then tries to continue.
    conversation = [
        {"role": "system", "content": system_prompt},
        *history,
        user_message,
    ]

    # add_generation_prompt=True asks the template to end the prompt at the
    # point where the assistant's reply is expected to begin.
    prompt = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    # NOTE(review): the rendered template may already contain the BOS token and
    # the tokenizer may add another one here -- confirm for this model.
    inputs = processor(text=prompt, return_tensors="pt").to(model.device)
    # Cast floating-point tensors to half precision; integer tensors are left as-is.
    for k, v in inputs.items():
        if isinstance(v, torch.Tensor) and v.is_floating_point():
            inputs[k] = v.half()

    # Generate a response from the model
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.7,
        "repetition_penalty": 1.1,
        "top_p": 0.95,
        "top_k": 50,
    }
    try:
        # model.generate runs on a worker thread and feeds the streamer.
        # NOTE(review): the loop below has no await, so it still blocks the
        # event loop until generation finishes. An exception raised inside
        # model.generate occurs on the worker thread and is not caught here.
        thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()
        chunks = []
        for new_text in streamer:
            logging.debug(f"LLM response: {new_text}")
            chunks.append(new_text)
        # Do not leave the worker thread dangling once the stream has ended.
        thread.join()
        response = "".join(chunks)
        # Store the full exchange only after success, user turn first, so a
        # failed generation leaves no unanswered user message in the history.
        chat_history.append(user_message)
        chat_history.append({"role": "assistant", "content": response})
        return response
    except Exception as e:
        # exc_info=True keeps the traceback in the log.
        logging.error(f"Error generating response: {e}", exc_info=True)
        return "Sorry, I couldn't generate a response."