Skip to content

Commit

Permalink
FIX: define traindataset for the CodeFeedback and WizardLM_evol_instruct datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
5eqn committed Jan 7, 2025
1 parent fdf36d2 commit e31e861
Showing 1 changed file with 2 additions and 0 deletions.
2 changes: 2 additions & 0 deletions examples/corda_finetuning/datautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ def get_calib_data(name, tokenizer, model_id, nsamples, seqlen=2048, seed=3):
selected_data_dict = (
load_dataset("iboing/CodeFeedback-Filtered-Instruction", split="train").shuffle(seed=seed).take(nsamples)
)
traindataset = []
for example in selected_data_dict:
if example.get("input", "") == "":
s = llama_chat_format.format(instruction=example["query"], response=example["answer"])
Expand All @@ -181,6 +182,7 @@ def get_calib_data(name, tokenizer, model_id, nsamples, seqlen=2048, seed=3):
selected_data_dict = (
load_dataset("iboing/WizardLM_evol_instruct_V2_143k", split="train").shuffle(seed=seed).take(nsamples)
)
traindataset = []
for example in selected_data_dict:
if example.get("input", "") == "":
s = llama_chat_format.format(
Expand Down

0 comments on commit e31e861

Please sign in to comment.