Skip to content

Commit

Permalink
fix(metadata): changes for the previous PR (#24)
Browse files Browse the repository at this point in the history
  • Loading branch information
shcheklein authored Dec 8, 2024
1 parent 9467166 commit eb083aa
Showing 1 changed file with 70 additions and 75 deletions.
145 changes: 70 additions & 75 deletions formats/json-metadata-tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1767,78 +1767,63 @@
" For further information visit https://errors.pydantic.dev/2.8/v/list_type\n",
"```\n",
"\n",
"##### Indeed, we can verify that schema in 2017 COCO detections changes around entry #36335 through the _instances_val2017.json_ file:\n",
"\n",
"```python\n",
" DataChain.from_storage(detections_uri).show_json_schema(model_name=\"Narrative\", jmespath=\"annotations[0]\").exec()\n",
"\n",
" >>>\n",
" class Instance(BaseModel):\n",
" segmentation: List[List[float]]\n",
" area: float\n",
" iscrowd: int\n",
" image_id: int\n",
" bbox: List[float]\n",
" category_id: int\n",
" id: int\n",
"```\n",
"\n",
"```python\n",
" DataChain.from_storage(detections_uri).show_json_schema(model_name=\"Narrative\", jmespath=\"annotations[36336]\").exec()\n",
"\n",
" >>>\n",
" class Segmentation(BaseModel):\n",
" counts: List[int]\n",
" size: List[int]\n",
" \n",
" \n",
" class Instance(BaseModel):\n",
" segmentation: Segmentation\n",
" area: int\n",
" iscrowd: int\n",
" image_id: int\n",
" bbox: List[int]\n",
" category_id: int\n",
" id: int\n",
"\n",
"```\n",
"\n",
"To avoid this problem, we will just ignore the last 446 detected instances with argument `nrows`\n",
"\n",
"##### Object name collisions\n",
"\n",
"If we examine the schema for JSON detections, we will see it carries metadata in array 'annotations' – which is named identically to an array with captions. To avoid namespace collisions, let us rename the detected object instances using an `object_name` argument:"
"Indeed, we can verify that the schema in the 2017 COCO detections changes around entry `#36335` through the `instances_val2017.json` file. To avoid this problem, we will just ignore the last 446 detected instances with argument `nrows`.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 46,
"id": "41b38187",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Preparing: 1 rows [00:00, 637.72 rows/s]\n",
"Download: 19.1MB [00:01, 16.3MB/s]\n",
"Processed: 1 rows [00:00, 1061.31 rows/s]\n",
"Cleanup: 2 tables [00:00, 4760.84 tables/s]\n",
"Preparing: 1 rows [00:00, 801.36 rows/s]\n",
"Preparing: 1 rows [00:00, 651.09 rows/s]\n",
"Download: 19.1MB [00:01, 18.0MB/s]\n",
"Processed: 1 rows [00:00, 1090.56 rows/s]\n",
"Cleanup: 2 tables [00:00, 3506.94 tables/s]\n"
"Processed: 1 rows [00:00, 972.71 rows/s]\n",
"Cleanup: 2 tables [00:00, 3731.59 tables/s]\n",
"Preparing: 1 rows [00:00, 907.47 rows/s]\n",
"Download: 19.1MB [00:01, 17.5MB/s]\n",
"Processed: 1 rows [00:00, 1029.02 rows/s]\n",
"Cleanup: 2 tables [00:00, 7667.83 tables/s]"
]
},
{
"ename": "TypeError",
"evalue": "'lineterm' is an invalid keyword argument for split()",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[43], line 21\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# Print JSON schema for annotations[36336]\u001b[39;00m\n\u001b[1;32m 14\u001b[0m after \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(\n\u001b[1;32m 15\u001b[0m DataChain\n\u001b[1;32m 16\u001b[0m \u001b[38;5;241m.\u001b[39mfrom_storage(detections_uri)\n\u001b[1;32m 17\u001b[0m \u001b[38;5;241m.\u001b[39mprint_json_schema(model_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNarrative\u001b[39m\u001b[38;5;124m\"\u001b[39m, jmespath\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mannotations[36336]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 18\u001b[0m \u001b[38;5;241m.\u001b[39mcollect(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmeta_schema\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 19\u001b[0m )\n\u001b[0;32m---> 21\u001b[0m sys\u001b[38;5;241m.\u001b[39mstdout\u001b[38;5;241m.\u001b[39mwritelines(unified_diff(before\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m), \u001b[43mafter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlineterm\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m))\n",
"\u001b[0;31mTypeError\u001b[0m: 'lineterm' is an invalid keyword argument for split()"
"name": "stdout",
"output_type": "stream",
"text": [
"--- \n",
"+++ \n",
"@@ -3,12 +3,17 @@\n",
" from datachain.lib.meta_formats import UserModel\n",
" \n",
" \n",
"+class Segmentation(UserModel):\n",
"+ counts: list[int]\n",
"+ size: list[int]\n",
"+\n",
"+\n",
" class Narrative(UserModel):\n",
"- segmentation: list[list[float]]\n",
"- area: float\n",
"+ segmentation: Segmentation\n",
"+ area: int\n",
" iscrowd: int\n",
" image_id: int\n",
"- bbox: list[float]\n",
"+ bbox: list[int]\n",
" category_id: int\n",
" id: int\n",
" \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
Expand All @@ -1847,49 +1832,59 @@
"from difflib import unified_diff\n",
"\n",
"\n",
"# Print JSON schema for annotations[0]\n",
"# Get JSON schema for annotations[0]\n",
"before = (next(\n",
" DataChain\n",
" .from_storage(detections_uri)\n",
" .print_json_schema(model_name=\"Narrative\", jmespath=\"annotations[0]\")\n",
" .collect(\"meta_schema\")\n",
"))\n",
"\n",
"# Print JSON schema for annotations[36336]\n",
"# Get JSON schema for annotations[36336]\n",
"after = next(\n",
" DataChain\n",
" .from_storage(detections_uri)\n",
" .print_json_schema(model_name=\"Narrative\", jmespath=\"annotations[36336]\")\n",
" .collect(\"meta_schema\")\n",
")\n",
"\n",
"sys.stdout.writelines(unified_diff(before.split(\"\\n\"), after.split(\"\\n\"), lineterm=\"\"))"
"sys.stdout.writelines(unified_diff(before.splitlines(keepends=True)[5:], after.splitlines(keepends=True)[5:]))"
]
},
{
"cell_type": "markdown",
"id": "663b092a",
"metadata": {},
"source": [
"##### Object name collisions\n",
"\n",
"If we examine the schema for JSON detections, we will see it carries metadata in array 'annotations' – which is named identically to an array with captions. To avoid namespace collisions, let us rename the detected object instances using an `object_name` argument:"
]
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 47,
"id": "46c51457-8137-440e-93b9-0e858e9f257e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Preparing: 1 rows [00:00, 787.37 rows/s]\n",
"Download: 19.1MB [00:01, 17.6MB/s]\n",
"Processed: 1 rows [00:00, 1338.75 rows/s]\n",
"Cleanup: 2 tables [00:00, 7463.17 tables/s]\n",
"Preparing: 1 rows [00:00, 671.20 rows/s]\n",
"Download: 19.1MB [00:01, 16.2MB/s]\n",
"Processed: 1 rows [00:00, 1100.29 rows/s]\n",
"Cleanup: 2 tables [00:00, 5332.87 tables/s]\n",
"Processed: 0 rows [00:00, ? rows/s]\n",
"Download: 3.69MB [00:00, 19.6MB/s]\n",
"Processed: 1 rows [00:00, 2.02 rows/s]\n",
"Generated: 5000 rows [00:00, 34404.75 rows/s]\n",
"Download: 3.69MB [00:00, 23.9MB/s]\n",
"Processed: 1 rows [00:00, 2.07 rows/s]\n",
"Generated: 5000 rows [00:00, 35158.09 rows/s]\n",
"Processed: 0 rows [00:00, ? rows/s]\n",
"\u001b[A\n",
"\u001b[A\n",
"Download: 3.69MB [00:00, 8.00MB/s]\n",
"Processed: 1 rows [00:00, 1.23 rows/s]\n",
"Generated: 25014 rows [00:00, 54395.43 rows/s]\n",
"Download: 3.69MB [00:00, 7.74MB/s]\n",
"Processed: 1 rows [00:00, 1.19 rows/s]\n",
"Generated: 25014 rows [00:00, 52029.02 rows/s]\n",
"Processed: 0 rows [00:00, ? rows/s]\n",
"\u001b[A\n",
"\u001b[A\n",
Expand All @@ -1905,10 +1900,10 @@
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"Download: 19.1MB [00:03, 6.09MB/s]\n",
"Processed: 1 rows [00:03, 3.80s/ rows]\n",
"Generated: 36335 rows [00:02, 16778.30 rows/s]\n",
"Cleanup: 16 tables [00:00, 1774.80 tables/s]\n"
"Download: 19.1MB [00:03, 6.33MB/s]\n",
"Processed: 1 rows [00:03, 3.55s/ rows]\n",
"Generated: 36335 rows [00:02, 17672.36 rows/s]\n",
"Cleanup: 21 tables [00:00, 2035.69 tables/s]\n"
]
}
],
Expand Down

0 comments on commit eb083aa

Please sign in to comment.