Skip to content

Commit

Permalink
FIX Make CorDA example work (#2300)
Browse files (browse the repository at this point in the history)
  • Loading branch information
5eqn authored Jan 7, 2025
1 parent fdf36d2 commit d967f63
Showing 1 changed file with 1 addition and 3 deletions.
4 changes: 1 addition & 3 deletions examples/corda_finetuning/datautils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -106,6 +106,7 @@ def get_qat_dataset(name, tokenizer, data_percent):
def get_calib_data(name, tokenizer, model_id, nsamples, seqlen=2048, seed=3):
print(f" get_data_from: {name}, nsamples={nsamples}, seqlen={seqlen}, {seed}")
cache_file = f"cache/{name}_{model_id.replace('/','_')}_{nsamples}_{seqlen}_{seed}.pt"
traindataset = []
if not os.path.exists("cache"):
os.makedirs("cache")
if os.path.exists(cache_file):
Expand Down Expand Up @@ -139,7 +140,6 @@ def get_calib_data(name, tokenizer, model_id, nsamples, seqlen=2048, seed=3):
tot_text = "\n\n".join(traindata["question"])
elif name == "alpaca":
selected_data_dict = load_dataset("iboing/alpaca_data", split="train").shuffle(seed=seed).take(nsamples)
traindataset = []
for example in selected_data_dict:
if example.get("input", "") == "":
s = llama_chat_format.format(instruction=example["instruction"], response=example["output"])
Expand All @@ -152,7 +152,6 @@ def get_calib_data(name, tokenizer, model_id, nsamples, seqlen=2048, seed=3):
return traindataset
elif name == "MetaMATH":
selected_data_dict = load_dataset("iboing/MetaMathQA-395K", split="train").shuffle(seed=seed).take(nsamples)
traindataset = []
for example in selected_data_dict:
if example.get("input", "") == "":
s = llama_chat_format.format(instruction=example["query"], response=example["response"])
Expand Down Expand Up @@ -196,7 +195,6 @@ def get_calib_data(name, tokenizer, model_id, nsamples, seqlen=2048, seed=3):
else:
raise NotImplementedError
print(f"tot_text={len(tot_text)}")
traindataset = []
for _ in range(nsamples):
i = random.randint(0, len(tot_text) - seqlen - 1)
j = i + seqlen * 10
Expand Down

0 comments on commit d967f63

Please sign in to comment.