Training a model by streaming from datasets (no need to download them)
opened this issue · comments
Deleted user commented
Can it be used to train a coding model, with code from a coding model used as the training input?
from datasets import load_dataset
from bitsandbytes.optim import Adam8bit
# Fine-tune `gpt` on the CodeParrot training split, read in streaming mode so
# the dataset does not have to be downloaded up front.
# NOTE(review): `gpt`, `tokenizer`, `torch`, `F` and `tqdm` are not defined in
# this snippet; they are assumed to come from earlier in the session.
gpt.gradient_checkpointing_enable()

codeparrot = load_dataset("transformersbook/codeparrot-train", streaming=True)
optimizer = Adam8bit(gpt.parameters(), lr=1e-5)

for row in tqdm(codeparrot["train"]):
    # Rows whose content is empty or a single character leave nothing to
    # predict after the one-token shift below, so skip them.
    if len(row["content"]) <= 1:
        continue

    # One sample per step, truncated to 128 tokens, moved to the GPU.
    batch = tokenizer(row["content"], truncation=True, max_length=128, return_tensors='pt')
    batch = {k: v.cuda() for k, v in batch.items()}

    # Only the forward pass and the loss run under autocast; the backward pass
    # and the optimizer step are kept outside the mixed-precision region.
    with torch.cuda.amp.autocast():
        # Call the module itself rather than `.forward` so registered hooks run.
        out = gpt(**batch)

        # Next-token objective: logits at position t are scored against the
        # token at position t + 1, hence the [:-1] / [1:] shift.
        loss = F.cross_entropy(
            out.logits[:, :-1, :].flatten(0, -2),
            batch['input_ids'][:, 1:].flatten(),
            reduction='mean',
        )

    print(loss)
    loss.backward()

    optimizer.step()
    optimizer.zero_grad()
idea from Fine-tuning
HuggingFace 🤗 Datasets library