Cosmos Nemotron release

NVlabs · Jan 7, 2025 · 33b4f1a · 33b4f1a
1 parent 37ae801
commit 33b4f1a
Show file tree

Hide file tree

Showing 284 changed files with 568 additions and 0 deletions.
diff --git a/data_prepare/coyo/coyo_downloader.py b/data_prepare/coyo/coyo_downloader.py
@@ -136,3 +136,5 @@ async def main(data_list):
 
 
 asyncio.run(main(metadata_list))
+
+
diff --git a/data_prepare/coyo/coyo_splitter.py b/data_prepare/coyo/coyo_splitter.py
@@ -46,3 +46,5 @@
             f.write(str(len(samples2write)))
 
         counter += 1
+
+
diff --git a/data_prepare/mmc4/mmc4_downloader.py b/data_prepare/mmc4/mmc4_downloader.py
@@ -151,3 +151,5 @@ async def main(data_list):
 
 
 asyncio.run(main(all_data))
+
+
diff --git a/data_prepare/mmc4/mmc4_filter_and_counter.py b/data_prepare/mmc4/mmc4_filter_and_counter.py
@@ -56,3 +56,5 @@
 
     with open(os.path.join(output_path, pkl.replace(".pkl", ".count")), "w") as f:
         f.write(str(len(filtered_annotation)))
+
+
diff --git a/data_prepare/mmc4/mmc4_merger.py b/data_prepare/mmc4/mmc4_merger.py
@@ -46,3 +46,5 @@
 
     with open(os.path.join(output_path, shard_name.replace(".jsonl", ".pkl")), "wb") as f:
         pickle.dump(data_list, f)
+
+
diff --git a/data_prepare/panda_split.py b/data_prepare/panda_split.py
@@ -107,3 +107,5 @@ def split_video_to_clips(
     import fire
 
     fire.Fire(split_video_to_clips)
+
+
diff --git a/data_prepare/sft/ART1_2.py b/data_prepare/sft/ART1_2.py
@@ -86,3 +86,5 @@ def coords_list2bbox(coords_list: List[List[int]], width: int, height: int) -> s
         jsonl_file.write("\n")  # Add a newline after each JSON object
 
 print("Processing complete.")
+
+
diff --git a/data_prepare/sft/ESTVQA.py b/data_prepare/sft/ESTVQA.py
@@ -33,3 +33,5 @@ def is_english(text):
             jsonl_file.write("\n")
 
 print("Processing complete.")
+
+
diff --git a/data_prepare/sft/LSVT.py b/data_prepare/sft/LSVT.py
@@ -84,3 +84,5 @@ def coords_list2bbox(coords_list: List[List[int]], width: int, height: int) -> s
             jsonl_file.write("\n")  # Add a newline after each JSON object
 
 print("Processing complete.")
+
+
diff --git a/data_prepare/sft/POIE.py b/data_prepare/sft/POIE.py
@@ -85,3 +85,5 @@ def coords_list2bbox(coords_list: List[List[int]], width: int, height: int) -> s
                 jsonl_file.write("\n")  # Add a newline after each JSON object
 
 print("Processing complete.")
+
+
diff --git a/data_prepare/sft/ReCTS.py b/data_prepare/sft/ReCTS.py
@@ -105,3 +105,5 @@ def coords_list2bbox(coords_list: List[List[int]], width: int, height: int) -> s
             jsonl_file.write("\n")  # Add a newline after each JSON object
 
 print("Processing complete.")
+
+
diff --git a/data_prepare/sft/SROIE.py b/data_prepare/sft/SROIE.py
@@ -66,3 +66,5 @@
 
 # Now 'images' contains all the opened images from the image_root directory
 print(f"Successfully opened {len(images)} out of {len(image_files)} images.")
+
+
diff --git a/data_prepare/sft/merge_idefics2.py b/data_prepare/sft/merge_idefics2.py
@@ -47,3 +47,5 @@ def load_jsonl(file_path):
     for item in all_data:
         json.dump(item, f)
         f.write("\n")
+
+
diff --git a/data_prepare/sft/merge_llava_onevision.py b/data_prepare/sft/merge_llava_onevision.py
@@ -99,3 +99,5 @@ def load_jsonl(file_path):
     import fire
 
     fire.Fire(main)
+
+
diff --git a/data_prepare/sft/merge_llava_onevision_eagle.py b/data_prepare/sft/merge_llava_onevision_eagle.py
@@ -83,3 +83,5 @@ def load_jsonl(file_path):
     import fire
 
     fire.Fire(main)
+
+
diff --git a/data_prepare/sft/mtwi.py b/data_prepare/sft/mtwi.py
@@ -148,3 +148,5 @@ def clip(x):
         jsonl_file.write("\n")  # Add a newline after each JSON object
 
 print("Processing complete.")
+
+
diff --git a/data_prepare/sft/preprocess_art_shangy.py b/data_prepare/sft/preprocess_art_shangy.py
@@ -87,3 +87,5 @@ def convert_txt_to_jsonl(input_file, output_file):
     output_file = "./art500k_processed.jsonl"
     base_path = "./"
     convert_txt_to_jsonl(input_file, output_file)
+
+
diff --git a/data_prepare/sft/preprocess_cambrian.py b/data_prepare/sft/preprocess_cambrian.py
@@ -70,3 +70,5 @@ def check_sample(sample):
 
 with open("cambrian_doc_1275k.json", "w") as f:
     json.dump(cambrian_doc_1275k, f)
+
+
diff --git a/data_prepare/sft/preprocess_cambrian_eagle.py b/data_prepare/sft/preprocess_cambrian_eagle.py
@@ -75,3 +75,5 @@ def check_sample(sample):
 
 with open(os.path.join(base_path, "cambrian_adlr_train.json"), "w") as f:
     json.dump(cambrian_eagle, f)
+
+
diff --git a/data_prepare/sft/preprocess_docreason.py b/data_prepare/sft/preprocess_docreason.py
@@ -29,3 +29,5 @@
 
 with open(json_file_processed, "w") as f:
     json.dump(records, f)
+
+
diff --git a/data_prepare/sft/preprocess_flan.py b/data_prepare/sft/preprocess_flan.py
@@ -70,3 +70,5 @@
 
 with open(os.path.join(save_path, "text_flan_1m.pkl"), "wb") as f:
     pickle.dump(filtered_samples, f)
+
+
diff --git a/data_prepare/sft/preprocess_idefics2.py b/data_prepare/sft/preprocess_idefics2.py
@@ -130,3 +130,5 @@ def process_dataset(args):
     # Map the process_dataset function to the arguments
     for _ in tqdm(pool.imap_unordered(process_dataset, args), total=len(args), desc="Processing datasets"):
         pass
+
+
diff --git a/data_prepare/sft/preprocess_idefics2_eagle.py b/data_prepare/sft/preprocess_idefics2_eagle.py
@@ -170,3 +170,5 @@ def main(
     import fire
 
     fire.Fire(main)
+
+
diff --git a/data_prepare/sft/preprocess_kvqa.py b/data_prepare/sft/preprocess_kvqa.py
@@ -45,3 +45,5 @@
     json.dump(new_records, f)
 
 print(len(new_records))
+
+
diff --git a/data_prepare/sft/preprocess_llava_onevision.py b/data_prepare/sft/preprocess_llava_onevision.py
@@ -103,3 +103,5 @@ def main(
     import fire
 
     fire.Fire(main)
+
+
diff --git a/data_prepare/sft/preprocess_m3it.py b/data_prepare/sft/preprocess_m3it.py
@@ -82,3 +82,5 @@
     save_filename = os.path.join(save_path, save_filename)
     with open(save_filename, "wb") as f:
         pickle.dump(dataset, f)
+
+
diff --git a/data_prepare/sft/preprocess_metamathqa.py b/data_prepare/sft/preprocess_metamathqa.py
@@ -25,3 +25,5 @@
 
 with open(json_file_processed, "w") as f:
     json.dump(records, f)
+
+
diff --git a/data_prepare/sft/preprocess_viquae.py b/data_prepare/sft/preprocess_viquae.py
@@ -45,3 +45,5 @@ def base64_to_pil_image(base64_string):
 
 with open(os.path.join(base_path, "viquae_processed.json"), "w") as f:
     json.dump(new_records, f)
+
+
diff --git a/data_prepare/sft/split_vflan.py b/data_prepare/sft/split_vflan.py
@@ -57,3 +57,5 @@
     print(f"Finished writing part-{counter:05d}.pkl!")
 
     counter += 1
+
+
diff --git a/data_prepare/sft/unichart_pretrain.py b/data_prepare/sft/unichart_pretrain.py
@@ -57,3 +57,5 @@
                 pbar.update(1)
 
 print("Processing complete.")
+
+
diff --git a/data_prepare/sft/unichart_sft.py b/data_prepare/sft/unichart_sft.py
@@ -59,3 +59,5 @@
                 pbar.update(1)
 
 print("Processing complete.")
+
+
diff --git a/llava/__init__.py b/llava/__init__.py
@@ -1,2 +1,4 @@
 from .entry import *
 from .media import *
+
+
diff --git a/llava/cli/eval.py b/llava/cli/eval.py
@@ -198,3 +198,5 @@ def main() -> None:
 
 if __name__ == "__main__":
     main()
+
+
diff --git a/llava/cli/infer.py b/llava/cli/infer.py
@@ -42,3 +42,5 @@ def main() -> None:
 
 if __name__ == "__main__":
     main()
+
+
diff --git a/llava/cli/run.py b/llava/cli/run.py
@@ -132,3 +132,5 @@ def main() -> None:
 
 if __name__ == "__main__":
     main()
+
+
diff --git a/llava/cli/upload2hf.py b/llava/cli/upload2hf.py
@@ -222,3 +222,5 @@ def main():
 
 if __name__ == "__main__":
     main()
+
+
diff --git a/llava/constants.py b/llava/constants.py
@@ -30,3 +30,5 @@
     "image": "<image>",
     "video": "<vila/video>",
 }
+
+
diff --git a/llava/conversation.py b/llava/conversation.py
@@ -189,3 +189,5 @@ def auto_set_conversation_mode(model_name_or_path: str) -> str:
             logger.info(f"Setting conversation mode to `{v}` based on model name/path `{model_name_or_path}`.")
             default_conversation = conv_templates[v]
             return
+
+
diff --git a/llava/data/__init__.py b/llava/data/__init__.py
@@ -3,3 +3,5 @@
 from .dataset_impl import *
 from .datasets_mixture import *
 from .simple_vila_webdataset import VILAWebDataset
+
+
diff --git a/llava/data/base.py b/llava/data/base.py
@@ -84,3 +84,5 @@ def __getitem__(self, index: int) -> Dict[str, Any]:
 
     def __len__(self) -> int:
         return len(self.instances)
+
+
diff --git a/llava/data/builder.py b/llava/data/builder.py
@@ -213,3 +213,5 @@ def build_dataset_legacy(
         data_args=data_args,
         training_args=training_args,
     )
+
+
diff --git a/llava/data/collate.py b/llava/data/collate.py
@@ -88,3 +88,5 @@ def __call__(self, instances: Sequence[Dict[str, Any]]) -> Dict[str, Any]:
             "labels": labels,
             "attention_mask": attention_mask,
         }
+
+
diff --git a/llava/data/dataset.py b/llava/data/dataset.py
@@ -1576,3 +1576,5 @@ def make_supervised_data_module(
         train_dataset=train_dataset,
         data_collator=data_collator,
     )
+
+
diff --git a/llava/data/dataset_impl/__init__.py b/llava/data/dataset_impl/__init__.py
@@ -2,3 +2,5 @@
 from .lita import *
 from .llava import *
 from .llava_cot import *
+
+
diff --git a/llava/data/dataset_impl/coyo_qa.py b/llava/data/dataset_impl/coyo_qa.py
@@ -203,3 +203,5 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
             data_dict["block_sizes"] = block_sizes
 
         return data_dict
+
+
diff --git a/llava/data/dataset_impl/coyo_recap.py b/llava/data/dataset_impl/coyo_recap.py
@@ -88,3 +88,5 @@ def __init__(
         else:
             self.caption_choice = data_args.caption_choice
         print(f"Current caption choice: {self.caption_choice}.")
+
+
diff --git a/llava/data/dataset_impl/dummy.py b/llava/data/dataset_impl/dummy.py
@@ -92,3 +92,5 @@ def process(self, instance: Dict[str, Any]) -> List[Dict[str, Any]]:
         # Add media to the beginning of the first message
         messages[0]["value"] = medias + [messages[0]["value"]]
         return messages
+
+
diff --git a/llava/data/dataset_impl/lita.py b/llava/data/dataset_impl/lita.py
@@ -241,3 +241,5 @@ def process(self, instance: Dict[str, Any]) -> List[Dict[str, Any]]:
         video = Video(instance["video_path"])
         messages[0]["value"] = [video, messages[0]["value"]]
         return messages
+
+
diff --git a/llava/data/dataset_impl/llava.py b/llava/data/dataset_impl/llava.py
@@ -132,3 +132,5 @@ def process(self, instance: Dict[str, Any]) -> List[Dict[str, Any]]:
             new_value = [*img_list, value.replace(DEFAULT_IMAGE_TOKEN, "").strip()]
             messages[0]["value"] = new_value
         return messages
+
+
diff --git a/llava/data/dataset_impl/llava_cot.py b/llava/data/dataset_impl/llava_cot.py
@@ -174,3 +174,5 @@ def process_multi_img(self, instance: Dict[str, Any], index: int) -> List[Dict[s
     assert len(medias) == 0, f"#Num of <images> does not match the number of images in the instance. {instance}"
 
     return messages
+
+
diff --git a/llava/data/dataset_impl/panda70m.py b/llava/data/dataset_impl/panda70m.py
@@ -229,3 +229,5 @@ def cleanup_corrupted_videos(
     jinfo = json.load(open(json_path))
     img_t = load_video(video_path, jinfo=jinfo)
     print(img_t)
+
+
diff --git a/llava/data/dataset_impl/sam.py b/llava/data/dataset_impl/sam.py
@@ -230,3 +230,5 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
     for idx, data in enumerate(dst):
         print(idx, data.keys())
 # nvcode: off
+
+
diff --git a/llava/data/dataset_impl/textocr.py b/llava/data/dataset_impl/textocr.py
@@ -289,3 +289,5 @@ def __getitem__(self, index):
 
     for idx in range(2):
         pprint(dataset[idx])
+
+
diff --git a/llava/data/dataset_impl/utils.py b/llava/data/dataset_impl/utils.py
@@ -27,3 +27,5 @@ def _remove_media_tokens(text: str) -> str:
     for token in ["<image>", "<video>"]:
         text = text.replace(token + "\n", "").replace("\n" + token, "").replace(token, "")
     return text.strip()
+
+
diff --git a/llava/data/datasets_mixture.py b/llava/data/datasets_mixture.py
@@ -54,3 +54,5 @@ def add_dataset(dataset):
 
 def register_datasets_mixtures():
     pass
+
+
diff --git a/llava/data/simple_vila_webdataset.py b/llava/data/simple_vila_webdataset.py
@@ -334,3 +334,5 @@ def merge(a: dict, b: dict, path=[], strict=False):
 
             # if idx >= 5:
             #     break
+
+
diff --git a/llava/data/utils.py b/llava/data/utils.py
@@ -35,3 +35,5 @@ def from_bytesio(cls, file_path: str, decode_audio: bool = True, decoder: str =
             print(f"unsupported type {type(file_path)}")
         video_cls = select_video_class(decoder)
         return video_cls(video_file, pathlib.Path(file_path).name, decode_audio)
+
+
diff --git a/llava/entry.py b/llava/entry.py
@@ -36,3 +36,5 @@ def load(
 
     model = load_pretrained_model(model_path, model_name, model_base, **kwargs)[1]
     return model
+
+
diff --git a/llava/eval/__init__.py b/llava/eval/__init__.py
@@ -7,3 +7,5 @@
 
 EVAL_ROOT = "scripts/eval"
 TASKS = io.load(os.path.join(os.path.dirname(__file__), "registry.yaml"))
+
+
diff --git a/llava/eval/cinepile.py b/llava/eval/cinepile.py
@@ -106,3 +106,5 @@ def main() -> None:
 
 if __name__ == "__main__":
     main()
+
+
diff --git a/llava/eval/egoschema.py b/llava/eval/egoschema.py
@@ -107,3 +107,5 @@ def main() -> None:
 
 if __name__ == "__main__":
     main()
+
+
diff --git a/llava/eval/eval_refcoco.py b/llava/eval/eval_refcoco.py
@@ -71,3 +71,5 @@ def computeIoU(bbox1, bbox2):
             except Exception as e:
                 print(e, flush=True)
                 continue
+
+
diff --git a/llava/eval/eventbench.py b/llava/eval/eventbench.py
@@ -91,3 +91,5 @@ def main() -> None:
 
 if __name__ == "__main__":
     main()
+
+
diff --git a/llava/eval/lmms/models/__init__.py b/llava/eval/lmms/models/__init__.py
@@ -1,3 +1,5 @@
 AVAILABLE_MODELS = {
     "vila_internal": "VILA",
 }
+
+
diff --git a/llava/eval/lmms/models/vila_internal.py b/llava/eval/lmms/models/vila_internal.py
@@ -139,3 +139,5 @@ def generate_until_multi_round(self, requests: List[Instance]) -> List[str]:
 
     def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
         raise NotImplementedError
+
+
diff --git a/llava/eval/lmms/tasks/__init__.py b/llava/eval/lmms/tasks/__init__.py
@@ -0,0 +1,2 @@
+
+
diff --git a/llava/eval/lmms/tasks/videomme.py b/llava/eval/lmms/tasks/videomme.py
@@ -64,3 +64,5 @@ def videomme_doc_to_text_subtitle(doc: Dict[str, Any], num_frames: int) -> str:
     prompt += "\n".join(doc["options"]) + "\n"
     prompt += "The best answer is:"
     return prompt
+
+
Original file line number	Diff line number	Diff line change
Expand Up		@@ -136,3 +136,5 @@ async def main(data_list):


		asyncio.run(main(metadata_list))
Original file line number	Diff line number	Diff line change
Expand Up		@@ -46,3 +46,5 @@
		f.write(str(len(samples2write)))

		counter += 1
Original file line number	Diff line number	Diff line change
Expand Up		@@ -151,3 +151,5 @@ async def main(data_list):


		asyncio.run(main(all_data))
Original file line number	Diff line number	Diff line change
Expand Up		@@ -56,3 +56,5 @@

		with open(os.path.join(output_path, pkl.replace(".pkl", ".count")), "w") as f:
		f.write(str(len(filtered_annotation)))
Original file line number	Diff line number	Diff line change
Expand Up		@@ -46,3 +46,5 @@

		with open(os.path.join(output_path, shard_name.replace(".jsonl", ".pkl")), "wb") as f:
		pickle.dump(data_list, f)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -107,3 +107,5 @@ def split_video_to_clips(
		import fire

		fire.Fire(split_video_to_clips)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -86,3 +86,5 @@ def coords_list2bbox(coords_list: List[List[int]], width: int, height: int) -> s
		jsonl_file.write("\n") # Add a newline after each JSON object

		print("Processing complete.")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -33,3 +33,5 @@ def is_english(text):
		jsonl_file.write("\n")

		print("Processing complete.")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -84,3 +84,5 @@ def coords_list2bbox(coords_list: List[List[int]], width: int, height: int) -> s
		jsonl_file.write("\n") # Add a newline after each JSON object

		print("Processing complete.")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -85,3 +85,5 @@ def coords_list2bbox(coords_list: List[List[int]], width: int, height: int) -> s
		jsonl_file.write("\n") # Add a newline after each JSON object

		print("Processing complete.")