#llm having existential crisis after the second input message

1 messages · Page 1 of 1 (latest)

tranquil beacon
#

the first message that I sent to the llm is being handled just fine, but after that it starts spouting massive nonsense. Whats wrong with my code?

import os
import logging
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoProcessor, LlavaForConditionalGeneration, TextIteratorStreamer, BitsAndBytesConfig
import torch
from collections import deque
import asyncio

# FastAPI app setup
app = FastAPI()

# Define the incoming request schema
class SpeechInput(BaseModel):
    speaker: str
    text: str

# Setup logging
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s [%(levelname)s] %(message)s")

# Constants
MODEL_ID = os.getenv("LLAVA_MODEL_ID", "CreitinGameplays/pixtral-1")
QUANT_CONFIG = BitsAndBytesConfig(load_in_8bit=True)
HISTORY_LEN = 20

# Initialize model and processor
logging.debug("Loading model and processor...")
try:
    model = LlavaForConditionalGeneration.from_pretrained(
        MODEL_ID,
        quantization_config=QUANT_CONFIG,
        device_map="auto"
    )
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    processor.patch_size = model.config.vision_config.patch_size
    processor.vision_feature_select_strategy = model.config.vision_feature_select_strategy
    logging.debug("Model and processor loaded successfully.")
except Exception as e:
    logging.exception("Failed to load model and processor.")
    raise e

# History buffer for keeping track of previous messages
chat_history = deque(maxlen=HISTORY_LEN)

async def generate_response_from_llm(text: str):
    """
    Generates a response from the LLM using the provided text input.
    """
    logging.debug(f"Generating response for: {text}")

    # Define the system prompt, this can be modified based on your desired behavior
    system_prompt = "You are Pixel, a cute little AI girl. You livestream sometimes and change moods occasionally. Your responses are rather short and never longer than two sentences."

    # Prepare input for the model, including the system prompt and user message
    conversation = [{"role": "system", "content": system_prompt}]
    conversation.append({"role": "user", "content": text})
    conversation.extend(chat_history)  # Add history if necessary

    prompt = processor.apply_chat_template(conversation, tokenize=False)
    inputs = processor(text=prompt, return_tensors="pt").to(model.device)

    for k, v in inputs.items():
        if isinstance(v, torch.Tensor) and v.is_floating_point():
            inputs[k] = v.half()

    # Generate a response from the model
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.7,
        "repetition_penalty": 1.1,
        "top_p": 0.95,
        "top_k": 50
    }

    try:
        # Perform inference in a separate thread to avoid blocking
        import threading
        thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        response = ""
        for new_text in streamer:
            logging.debug(f"LLM response: {new_text}")
            response += new_text

        # Append model response to history
        chat_history.append({"role": "assistant", "content": response})
        return response
    except Exception as e:
        logging.error(f"Error generating response: {e}")
        return "Sorry, I couldn't generate a response."```
#
# Define the FastAPI route to process speech input
@app.post("/process_speech/")
async def process_speech(input: SpeechInput):
    """
    Endpoint to receive speech input, generate a response from the LLM, and return the response.
    """
    # Log the received speech input for debugging
    logging.info(f"Received speech input: {input.speaker} - {input.text}")

    try:
        # Generate a response from the LLM
        response_text = await generate_response_from_llm(input.text)
        logging.info(f"Generated response: {response_text}")
    except Exception as e:
        logging.error(f"Error generating response: {e}")
        response_text = "Sorry, there was an error generating a response."

    return {"status": "success", "response": response_text}```
tranquil beacon
#

anyone?

#

pls