#got an unexpected keyword argument 'num_items_in_batch'

2 messages · Page 1 of 1 (latest)

trim ore
#
from datasets import disable_caching, load_dataset, Dataset, Features, Value
from tqdm.auto import tqdm


# Fallback used if loading fails below: an empty dataset that still exposes a "content" column,
# so the later steps can run against it.
cleaned_dataset = Dataset.from_dict({"content": []})

print("Loading dataset...")

# One table drives both the schema and the column cleanup.
# Order matters: it is the order in which the union schema declares its columns.
column_dtypes = (
    ("document_id", "string"),
    ("filing_date", "string"),   # present in some batches
    ("title", "string"),         # present in some batches
    ("chunk_index", "int64"),    # present in some batches
    ("chunk_id", "int64"),       # present in some batches
    ("problem_id", "int64"),     # present in some batches
    ("sample_id", "int64"),      # newly added field
    ("article_id", "int64"),     # newly added field
    ("thread_id", "int64"),      # newly added field
    ("source", "string"),        # from pdf_chunks and others
    ("content", "string"),
)

try:
    # Union schema covering every column that may appear in any batch.
    union_schema = Features({name: Value(dtype) for name, dtype in column_dtypes})

    # Load rows 1000-1999 of the train split, typed with the union schema.
    dataset = load_dataset(
        "itztheking/FQwen-1.0",
        split="train[1000:2000]",
        features=union_schema,
    )

    # Keep only "content"; every other schema column is dropped.
    extra_columns = [name for name, _ in column_dtypes if name != "content"]
    cleaned_dataset = dataset.remove_columns(extra_columns)

    print(f"Successfully created dataset with {len(cleaned_dataset)} examples")

except Exception as e:
    # Best-effort: report the problem and continue with the empty fallback dataset.
    print(f"Error loading dataset: {e}")

# Preview the first example, truncated to 200 characters.
if len(cleaned_dataset) > 0:
    sample = cleaned_dataset[0]["content"]
    print("\nSample content:")
    if len(sample) > 200:
        print(sample[:200] + "...")
    else:
        print(sample)

# Prepare for formatting: ensure your tokenizer is defined with an eos_token.
# The value read here is appended to every document by formatting_prompts_func.
# NOTE(review): `tokenizer` is not created anywhere in this snippet — presumably it comes
# from an earlier model-loading cell; confirm it exists and that eos_token is not None.
EOS_TOKEN = tokenizer.eos_token

# Batch formatter meant to be passed to Dataset.map(..., batched=True).
def formatting_prompts_func(batch):
    """Build the ``text`` column by appending the EOS token to each document.

    Args:
        batch: Mapping whose ``"content"`` entry is iterated and concatenated
            with ``EOS_TOKEN`` (so its items are expected to be strings).

    Returns:
        A dict with a single key ``"text"`` holding one list entry per item
        in ``batch["content"]``, in the same order.
    """
    texts = []
    for document in batch["content"]:
        texts.append(document + EOS_TOKEN)
    return {"text": texts}

print("\nFormatting dataset for pretraining...")

# Apply the formatter batch-wise to add the "text" column, then drop the
# original "content" column so that only "text" is left.
formatted_dataset = cleaned_dataset.map(
    formatting_prompts_func, batched=True
).remove_columns(["content"])

# Report the size and show the first 100 characters of the first example.
num_formatted = len(formatted_dataset)
print(f"\nDataset formatted with {num_formatted} examples")
if num_formatted > 0:
    sample_text = formatted_dataset[0]["text"]
    print(f"Sample formatted text: {sample_text[:100]}...")

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments

# Query bf16 support once; fp16 is enabled exactly when bf16 is not.
use_bf16 = is_bfloat16_supported()

# Optimisation and logging settings for the run.
training_args = UnslothTrainingArguments(
    # Effective batch of 16 sequences per optimizer step (1 per device x 16 accumulation steps).
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    warmup_ratio=0.1,
    num_train_epochs=1,
    # The embedding learning rate is set 10x lower than the main learning rate.
    learning_rate=5e-5,
    embedding_learning_rate=5e-6,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.00,
    lr_scheduler_type="cosine",
    seed=3407,
    output_dir="outputs",
    report_to="wandb",  # Use this for WandB etc
)

# `model`, `tokenizer` and `max_seq_length` are expected to be defined before this point.
trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=16,
    args=training_args,
)

# NOTE(review): the TypeError quoted at the end of this thread
# ("forward() got an unexpected keyword argument 'num_items_in_batch'") is presumably
# raised from this call; it looks like a transformers/unsloth version mismatch for the
# loaded model class — confirm the installed versions before changing this code.
trainer_stats = trainer.train()
#

Error:
TypeError: Qwen2_5_VLForConditionalGeneration.forward() got an unexpected keyword argument 'num_items_in_batch'