Skip to content

Commit

Permalink
fix(metadata): changes for the previous PR (#24)
Browse files Browse the repository at this point in the history
  • Loading branch information
shcheklein authored Dec 8, 2024
1 parent 9467166 commit eb083aa
Showing 1 changed file with 70 additions and 75 deletions.
145 changes: 70 additions & 75 deletions formats/json-metadata-tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1767,78 +1767,63 @@
" For further information visit https://errors.pydantic.dev/2.8/v/list_type\n",
"```\n",
"\n",
"##### Indeed, we can verify that schema in 2017 COCO detections changes around entry #36335 through the _instances_val2017.json_ file:\n",
"\n",
"```python\n",
" DataChain.from_storage(detections_uri).show_json_schema(model_name=\"Narrative\", jmespath=\"annotations[0]\").exec()\n",
"\n",
" >>>\n",
" class Instance(BaseModel):\n",
" segmentation: List[List[float]]\n",
" area: float\n",
" iscrowd: int\n",
" image_id: int\n",
" bbox: List[float]\n",
" category_id: int\n",
" id: int\n",
"```\n",
"\n",
"```python\n",
" DataChain.from_storage(detections_uri).show_json_schema(model_name=\"Narrative\", jmespath=\"annotations[36336]\").exec()\n",
"\n",
" >>>\n",
" class Segmentation(BaseModel):\n",
" counts: List[int]\n",
" size: List[int]\n",
" \n",
" \n",
" class Instance(BaseModel):\n",
" segmentation: Segmentation\n",
" area: int\n",
" iscrowd: int\n",
" image_id: int\n",
" bbox: List[int]\n",
" category_id: int\n",
" id: int\n",
"\n",
"```\n",
"\n",
"To avoid this problem, we will just ignore the last 446 detected instances with argument `nrows`\n",
"\n",
"##### Object name collisions\n",
"\n",
"If we examine the schema for JSON detections, we will see it carries metadata in array 'annotations' – which is named identically to an array with captions. To avoid namespace collisions, let us rename the detected object instances using an `object_name` argument:"
"Indeed, we can verify that the schema in the 2017 COCO detections changes around entry `#36335` through the `instances_val2017.json` file. To avoid this problem, we will just ignore the last 446 detected instances with argument `nrows`.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 46,
"id": "41b38187",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Preparing: 1 rows [00:00, 637.72 rows/s]\n",
"Download: 19.1MB [00:01, 16.3MB/s]\n",
"Processed: 1 rows [00:00, 1061.31 rows/s]\n",
"Cleanup: 2 tables [00:00, 4760.84 tables/s]\n",
"Preparing: 1 rows [00:00, 801.36 rows/s]\n",
"Preparing: 1 rows [00:00, 651.09 rows/s]\n",
"Download: 19.1MB [00:01, 18.0MB/s]\n",
"Processed: 1 rows [00:00, 1090.56 rows/s]\n",
"Cleanup: 2 tables [00:00, 3506.94 tables/s]\n"
"Processed: 1 rows [00:00, 972.71 rows/s]\n",
"Cleanup: 2 tables [00:00, 3731.59 tables/s]\n",
"Preparing: 1 rows [00:00, 907.47 rows/s]\n",
"Download: 19.1MB [00:01, 17.5MB/s]\n",
"Processed: 1 rows [00:00, 1029.02 rows/s]\n",
"Cleanup: 2 tables [00:00, 7667.83 tables/s]"
]
},
{
"ename": "TypeError",
"evalue": "'lineterm' is an invalid keyword argument for split()",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[43], line 21\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# Print JSON schema for annotations[36336]\u001b[39;00m\n\u001b[1;32m 14\u001b[0m after \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(\n\u001b[1;32m 15\u001b[0m DataChain\n\u001b[1;32m 16\u001b[0m \u001b[38;5;241m.\u001b[39mfrom_storage(detections_uri)\n\u001b[1;32m 17\u001b[0m \u001b[38;5;241m.\u001b[39mprint_json_schema(model_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNarrative\u001b[39m\u001b[38;5;124m\"\u001b[39m, jmespath\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mannotations[36336]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 18\u001b[0m \u001b[38;5;241m.\u001b[39mcollect(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmeta_schema\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 19\u001b[0m )\n\u001b[0;32m---> 21\u001b[0m sys\u001b[38;5;241m.\u001b[39mstdout\u001b[38;5;241m.\u001b[39mwritelines(unified_diff(before\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m), \u001b[43mafter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msplit\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlineterm\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m))\n",
"\u001b[0;31mTypeError\u001b[0m: 'lineterm' is an invalid keyword argument for split()"
"name": "stdout",
"output_type": "stream",
"text": [
"--- \n",
"+++ \n",
"@@ -3,12 +3,17 @@\n",
" from datachain.lib.meta_formats import UserModel\n",
" \n",
" \n",
"+class Segmentation(UserModel):\n",
"+ counts: list[int]\n",
"+ size: list[int]\n",
"+\n",
"+\n",
" class Narrative(UserModel):\n",
"- segmentation: list[list[float]]\n",
"- area: float\n",
"+ segmentation: Segmentation\n",
"+ area: int\n",
" iscrowd: int\n",
" image_id: int\n",
"- bbox: list[float]\n",
"+ bbox: list[int]\n",
" category_id: int\n",
" id: int\n",
" \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
Expand All @@ -1847,49 +1832,59 @@
"from difflib import unified_diff\n",
"\n",
"\n",
"# Print JSON schema for annotations[0]\n",
"# Get JSON schema for annotations[0]\n",
"before = (next(\n",
" DataChain\n",
" .from_storage(detections_uri)\n",
" .print_json_schema(model_name=\"Narrative\", jmespath=\"annotations[0]\")\n",
" .collect(\"meta_schema\")\n",
"))\n",
"\n",
"# Print JSON schema for annotations[36336]\n",
"# Get JSON schema for annotations[36336]\n",
"after = next(\n",
" DataChain\n",
" .from_storage(detections_uri)\n",
" .print_json_schema(model_name=\"Narrative\", jmespath=\"annotations[36336]\")\n",
" .collect(\"meta_schema\")\n",
")\n",
"\n",
"sys.stdout.writelines(unified_diff(before.split(\"\\n\"), after.split(\"\\n\"), lineterm=\"\"))"
"sys.stdout.writelines(unified_diff(before.splitlines(keepends=True)[5:], after.splitlines(keepends=True)[5:]))"
]
},
{
"cell_type": "markdown",
"id": "663b092a",
"metadata": {},
"source": [
"##### Object name collisions\n",
"\n",
"If we examine the schema for JSON detections, we will see it carries metadata in array 'annotations' – which is named identically to an array with captions. To avoid namespace collisions, let us rename the detected object instances using an `object_name` argument:"
]
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 47,
"id": "46c51457-8137-440e-93b9-0e858e9f257e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Preparing: 1 rows [00:00, 787.37 rows/s]\n",
"Download: 19.1MB [00:01, 17.6MB/s]\n",
"Processed: 1 rows [00:00, 1338.75 rows/s]\n",
"Cleanup: 2 tables [00:00, 7463.17 tables/s]\n",
"Preparing: 1 rows [00:00, 671.20 rows/s]\n",
"Download: 19.1MB [00:01, 16.2MB/s]\n",
"Processed: 1 rows [00:00, 1100.29 rows/s]\n",
"Cleanup: 2 tables [00:00, 5332.87 tables/s]\n",
"Processed: 0 rows [00:00, ? rows/s]\n",
"Download: 3.69MB [00:00, 19.6MB/s]\n",
"Processed: 1 rows [00:00, 2.02 rows/s]\n",
"Generated: 5000 rows [00:00, 34404.75 rows/s]\n",
"Download: 3.69MB [00:00, 23.9MB/s]\n",
"Processed: 1 rows [00:00, 2.07 rows/s]\n",
"Generated: 5000 rows [00:00, 35158.09 rows/s]\n",
"Processed: 0 rows [00:00, ? rows/s]\n",
"\u001b[A\n",
"\u001b[A\n",
"Download: 3.69MB [00:00, 8.00MB/s]\n",
"Processed: 1 rows [00:00, 1.23 rows/s]\n",
"Generated: 25014 rows [00:00, 54395.43 rows/s]\n",
"Download: 3.69MB [00:00, 7.74MB/s]\n",
"Processed: 1 rows [00:00, 1.19 rows/s]\n",
"Generated: 25014 rows [00:00, 52029.02 rows/s]\n",
"Processed: 0 rows [00:00, ? rows/s]\n",
"\u001b[A\n",
"\u001b[A\n",
Expand All @@ -1905,10 +1900,10 @@
"\u001b[A\n",
"\u001b[A\n",
"\u001b[A\n",
"Download: 19.1MB [00:03, 6.09MB/s]\n",
"Processed: 1 rows [00:03, 3.80s/ rows]\n",
"Generated: 36335 rows [00:02, 16778.30 rows/s]\n",
"Cleanup: 16 tables [00:00, 1774.80 tables/s]\n"
"Download: 19.1MB [00:03, 6.33MB/s]\n",
"Processed: 1 rows [00:03, 3.55s/ rows]\n",
"Generated: 36335 rows [00:02, 17672.36 rows/s]\n",
"Cleanup: 21 tables [00:00, 2035.69 tables/s]\n"
]
}
],
Expand Down

0 comments on commit eb083aa

Please sign in to comment.