from datasets import disable_caching, load_dataset, Dataset, Features, Value
from tqdm.auto import tqdm
# Fallback value: an empty dataset with a single "content" column, so the
# name is still bound if loading fails (the failure is only printed).
cleaned_dataset = Dataset.from_dict({"content": []})

print("Loading dataset...")
try:
    # Explicit schema handed to the loader; "content" is the only column
    # that is kept once loading succeeds.
    union_schema = Features(
        {
            "document_id": Value("string"),
            "filing_date": Value("string"),
            "title": Value("string"),
            "chunk_index": Value("int64"),
            "chunk_id": Value("int64"),
            "problem_id": Value("int64"),
            "sample_id": Value("int64"),
            "article_id": Value("int64"),
            "thread_id": Value("int64"),
            "source": Value("string"),
            "content": Value("string"),
        }
    )
    dataset = load_dataset(
        "itztheking/FQwen-1.0",
        split="train[1000:2000]",
        features=union_schema,
    )
    # Drop every declared column except the text itself. The names come from
    # the schema above, in declaration order.
    cleaned_dataset = dataset.remove_columns(
        [name for name in union_schema if name != "content"]
    )
    print(f"Successfully created dataset with {len(cleaned_dataset)} examples")
except Exception as e:
    # Best effort: report the problem and continue with the empty fallback.
    print(f"Error loading dataset: {e}")

# Show a preview of the first document, truncated to 200 characters.
if len(cleaned_dataset) > 0:
    sample = cleaned_dataset[0]["content"]
    print("\nSample content:")
    if len(sample) > 200:
        print(sample[:200] + "...")
    else:
        print(sample)
# End-of-sequence marker taken from the tokenizer (defined outside this
# section); it is appended to every document by the function below.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func(batch):
    """Build the "text" column for one batch of documents.

    Args:
        batch: Mapping whose "content" entry holds the batch's strings.

    Returns:
        A dict with a single "text" key containing each "content" string
        followed by ``EOS_TOKEN``, in the same order as the input.
    """
    documents = batch["content"]
    return {"text": [document + EOS_TOKEN for document in documents]}
print("\nFormatting dataset for pretraining...")
# Add the EOS-terminated "text" column batch by batch, then drop the raw
# "content" column it was built from.
formatted_dataset = cleaned_dataset.map(
    formatting_prompts_func, batched=True
).remove_columns(["content"])

print(f"\nDataset formatted with {len(formatted_dataset)} examples")

# Preview the first 100 characters of the first formatted example, if any.
if len(formatted_dataset) > 0:
    sample_text = formatted_dataset[0]["text"]
    print(f"Sample formatted text: {sample_text[:100]}...")
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments
# Hyperparameters for the run, built separately so the trainer call below
# stays short.
training_args = UnslothTrainingArguments(
    # Batching: one sequence per device, gradients accumulated over 16 steps.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    # Schedule: a single epoch, cosine decay, first 10% of steps as warmup.
    num_train_epochs=1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    # Optimisation. NOTE(review): embedding_learning_rate is presumably the
    # separate, smaller rate Unsloth applies to embedding layers - confirm.
    learning_rate=5e-5,
    embedding_learning_rate=5e-6,
    optim="adamw_8bit",
    weight_decay=0.0,
    # Mixed precision: bf16 when the helper reports support, otherwise fp16.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    # Bookkeeping.
    logging_steps=1,
    seed=3407,
    output_dir="outputs",
    report_to="wandb",
)

# model, tokenizer and max_seq_length are defined outside this section.
trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=16,
    args=training_args,
)

# Run training and keep the returned statistics.
trainer_stats = trainer.train()