From 9278a33b00e160fceed68a11d4ff4e19101bca3f Mon Sep 17 00:00:00 2001 From: Ivan Shcheklein Date: Sun, 22 Dec 2024 10:03:17 -0800 Subject: [PATCH] fix(json-metadata): update tutorial accordingly to the recent changes (#25) --- formats/json-metadata-tutorial.ipynb | 611 +++++++++++---------------- 1 file changed, 256 insertions(+), 355 deletions(-) diff --git a/formats/json-metadata-tutorial.ipynb b/formats/json-metadata-tutorial.ipynb index 7d136df..02f5d18 100644 --- a/formats/json-metadata-tutorial.ipynb +++ b/formats/json-metadata-tutorial.ipynb @@ -41,7 +41,7 @@ "- Every sample corresponds to a line in a shared \"JSON lines\" file (.jsonl format).\n", "- Every sample corresponds to an array member inside a common JSON file.\n", "\n", - "JSON files can be large and hard to read on computer screen. To better understand metadata models, DataChain provides functions `print_json_schema()` and `print_jsonl_schema()` which can read and print the JSON layout in Pydantic format. Once the data model is apporoved, data loading and validation is handled by functions `from_json()` and `from_jsonl()`.\n", + "JSON files can be large and hard to read on a computer screen. To better understand metadata models, DataChain provides a helper `gen_datamodel_code()` which can read and print the JSON layout in Pydantic format. Once the data model is approved, data loading and validation is handled by the function `from_json()`.\n", "\n", "💡 DataChain supports lazy execution. No data is parsed until the results are requested by downstream chains. This means, for example, that validation errors (if any) will not occur immediately but may be triggered by the downstream `exec()`, `count()`, `collect()` or similar actions. 
When many operations are chained together, it is common to intersperse with `save` operations to cache the intermediate results.\n", "\n", @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "ee62e86a", "metadata": {}, "outputs": [ @@ -69,12 +69,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "id": "430f08e0-e720-4083-b1b4-990882221ef6", "metadata": {}, "outputs": [], "source": [ - "from datachain import C, DataChain" + "from datachain import C, DataChain\n", + "from datachain.lib.meta_formats import gen_datamodel_code" ] }, { @@ -91,10 +92,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "e99f53cc-ecef-4549-8233-4d16e83fc207", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ivan/Projects/datachain-examples/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Listing gs://datachain-demo: 400 objects [00:00, 2667.18 objects/s]\n", + "Processed: 1 rows [00:00, 6.13 rows/s]\n", + "Generated: 400 rows [00:00, 42332.50 rows/s]\n", + "Cleanup: 1 tables [00:00, 2933.08 tables/s]\n" + ] + }, { "data": { "text/html": [ @@ -225,7 +238,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "id": "a901365a-c0a5-48cf-9a5e-acb6967480a9", "metadata": {}, "outputs": [ @@ -237,7 +250,7 @@ "" ] }, - "execution_count": 4, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -252,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "id": "3bed8e31-5e21-45ef-b2f8-896251994ac5", "metadata": {}, "outputs": [ @@ -262,7 +275,7 @@ "'{\"class\": \"cat\", \"id\": \"1009\", \"num_annotators\": 8, \"inference\": {\"class\": \"dog\", \"confidence\": 0.68}}'" ] }, - "execution_count": 5, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -285,27 +298,17 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "id": "57cb1fb8-c888-48fb-a193-18fefe2475d7", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Preparing: 1 rows [00:00, 557.83 rows/s]\n", - "Download: 102B [00:00, 7.39kB/s]\n", - "Processed: 1 rows [00:00, 731.61 rows/s]\n", - "Cleanup: 2 tables [00:00, 3609.56 tables/s]" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ "# generated by datamodel-codegen:\n", "# filename: \n", - "# timestamp: 2024-12-08T01:23:45+00:00\n", + "# timestamp: 2024-12-22T03:09:04+00:00\n", "\n", "from __future__ import annotations\n", "\n", @@ -319,33 +322,24 @@ " confidence: float\n", "\n", "\n", - "class Modeljson4cbc84d5e696474fa07177a4f88a07da(UserModel):\n", + "class Modeljson68097a9478a241e78785787e238c7d02(UserModel):\n", " class_: str = 
Field(..., alias='class')\n", " id: str\n", " num_annotators: int\n", " inference: Inference\n", "\n", - "DataModel.register(Modeljson4cbc84d5e696474fa07177a4f88a07da)\n", - "spec = Modeljson4cbc84d5e696474fa07177a4f88a07da\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ + "DataModel.register(Modeljson68097a9478a241e78785787e238c7d02)\n", + "spec = Modeljson68097a9478a241e78785787e238c7d02\n", "\n" ] } ], "source": [ "\n", - "print(next(\n", - " DataChain\n", - " .from_storage('gs://datachain-demo/dogs-and-cats/cat.1009.json', type='text', anon=True)\n", - " .print_json_schema()\n", - " .collect(\"meta_schema\")\n", - "))" + "\n", + "chain = DataChain.from_storage('gs://datachain-demo/dogs-and-cats/cat.1009.json', anon=True)\n", + "print(gen_datamodel_code(next(chain.collect(\"file\"))))\n", + " " ] }, { @@ -375,29 +369,19 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "9f9c77e2-9e24-43ec-954c-239a797f814e", "metadata": { "scrolled": true }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Preparing: 1 rows [00:00, 771.44 rows/s]\n", - "Download: 32.0kB [00:00, 901kB/s]\n", - "Processed: 1 rows [00:00, 1212.23 rows/s]\n", - "Cleanup: 2 tables [00:00, 5928.34 tables/s]" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ "# generated by datamodel-codegen:\n", "# filename: \n", - "# timestamp: 2024-12-08T01:45:41+00:00\n", + "# timestamp: 2024-12-22T03:22:46+00:00\n", "\n", "from __future__ import annotations\n", "\n", @@ -430,24 +414,21 @@ "spec = Narrative\n", "\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] } ], "source": [ "from datachain import DataChain\n", + "from datachain.lib.meta_formats import gen_datamodel_code\n", + "\n", "uri = \"gs://datachain-demo/openimages-jsonl/open_images_validation_localized_narratives.jsonl\"\n", - "print(next(\n", - " DataChain\n", - " .from_storage(uri, type=\"text\", 
anon=True)\n", - " .print_jsonl_schema(model_name=\"Narrative\")\n", - " .collect(\"meta_schema\")\n", - "))" + "chain = DataChain.from_storage(uri, type=\"text\", anon=True)\n", + "print(\n", + " gen_datamodel_code(\n", + " next(chain.collect(\"file\")),\n", + " model_name=\"Narrative\",\n", + " format=\"jsonl\"\n", + " )\n", + ")" ] }, { @@ -467,33 +448,19 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "41c68b60-9da9-453a-8d50-c1c49961073a", "metadata": { "scrolled": true }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Listing gs://datachain-demo: 6 objects [00:00, 60.51 objects/s]\n", - "Processed: 1 rows [00:00, 9.61 rows/s]\n", - "Generated: 6 rows [00:00, 3076.88 rows/s]\n", - "Cleanup: 1 tables [00:00, 2801.81 tables/s]\n", - "Preparing: 1 rows [00:00, 378.21 rows/s]\n", - "Download: 3.69MB [00:01, 2.97MB/s]\n", - "Processed: 1 rows [00:00, 890.70 rows/s]\n", - "Cleanup: 2 tables [00:00, 4000.29 tables/s]" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ "# generated by datamodel-codegen:\n", "# filename: \n", - "# timestamp: 2024-12-08T01:47:10+00:00\n", + "# timestamp: 2024-12-22T03:24:24+00:00\n", "\n", "from __future__ import annotations\n", "\n", @@ -543,25 +510,20 @@ "spec = COCO\n", "\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] } ], "source": [ "from datachain import DataChain\n", + "from datachain.lib.meta_formats import gen_datamodel_code\n", "\n", "captions_example_path = \"gs://datachain-demo/coco2017/annotations/captions_val2017.json\"\n", - "print(next(\n", - " DataChain\n", - " .from_storage(captions_example_path, type=\"text\", anon=True)\n", - " .print_json_schema(model_name=\"COCO\")\n", - " .collect(\"meta_schema\")\n", - "))" + "chain = DataChain.from_storage(captions_example_path, type=\"text\", anon=True)\n", + "print(\n", + " gen_datamodel_code(\n", + " next(chain.collect(\"file\")), \n", + " 
model_name=\"COCO\"\n", + " )\n", + ")" ] }, { @@ -576,27 +538,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "4703f3a5-fd50-4270-9d25-2270a55aa67b", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Preparing: 1 rows [00:00, 979.29 rows/s]\n", - "Download: 3.69MB [00:00, 124MB/s]\n", - "Processed: 1 rows [00:00, 911.61 rows/s]\n", - "Cleanup: 2 tables [00:00, 3407.23 tables/s]" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ "# generated by datamodel-codegen:\n", "# filename: \n", - "# timestamp: 2024-12-08T01:48:10+00:00\n", + "# timestamp: 2024-12-22T03:26:33+00:00\n", "\n", "from __future__ import annotations\n", "\n", @@ -613,24 +565,16 @@ "spec = Annotations\n", "\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] } ], "source": [ - "from datachain import DataChain\n", - "\n", - "print(next(\n", - " DataChain\n", - " .from_storage(captions_example_path, type=\"text\", anon=True)\n", - " .print_json_schema(jmespath=\"annotations\", model_name=\"Annotations\")\n", - " .collect(\"meta_schema\")\n", - "))" + "print(\n", + " gen_datamodel_code(\n", + " next(chain.collect(\"file\")), \n", + " jmespath=\"annotations\", \n", + " model_name=\"Annotations\"\n", + " )\n", + ")" ] }, { @@ -646,7 +590,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "453618d2-5a7d-4501-96f5-4e7eb25e379b", "metadata": { "scrolled": true @@ -656,15 +600,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "Preparing: 1 rows [00:00, 502.01 rows/s]\n", - "Download: 102B [00:00, 6.87kB/s]\n", - "Processed: 1 rows [00:00, 782.96 rows/s]\n", - "Cleanup: 2 tables [00:00, 3716.71 tables/s]\n", "Processed: 0 rows [00:00, ? 
rows/s]\n", - "Download: 102B [00:00, 13.3kB/s]\n", - "Processed: 1 rows [00:00, 3.52 rows/s]\n", - "Generated: 1 rows [00:00, 337.73 rows/s]\n", - "Cleanup: 1 tables [00:00, 2414.68 tables/s]\n" + "Download: 102B [00:00, 40.5kB/s]\n", + "Processed: 1 rows [00:00, 8.18 rows/s]\n", + "Generated: 1 rows [00:00, 531.93 rows/s]\n", + "Cleanup: 1 tables [00:00, 854.06 tables/s]\n" ] }, { @@ -762,7 +702,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "483af1bf-4881-470f-952c-1a2da3c21636", "metadata": { "scrolled": true @@ -772,18 +712,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "Preparing: 0 rows [00:00, ? rows/s]" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Preparing: 1 rows [00:00, 186.52 rows/s]\n", - "Download: 1.45kB [00:00, 66.3kB/s]\n", - "Processed: 1 rows [00:00, 737.65 rows/s]\n", - "Cleanup: 2 tables [00:00, 3367.57 tables/s]\n", - "Processed: 2 rows [00:00, 12.75 rows/s]" + "Listing gs://datachain-demo: 400 objects [00:00, 2669.78 objects/s]\n", + "Processed: 1 rows [00:00, 6.02 rows/s]\n", + "Generated: 400 rows [00:00, 29216.38 rows/s]\n", + "Cleanup: 1 tables [00:00, 2519.10 tables/s]\n", + "Processed: 2 rows [00:00, 14.95 rows/s]" ] }, { @@ -796,27 +729,10 @@ " For further information visit https://errors.pydantic.dev/2.10/v/string_type\n", "image_id.Rotation\n", " Input should be a valid string [type=string_type, input_value=nan, input_type=float]\n", - " For further information visit https://errors.pydantic.dev/2.10/v/string_type\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processed: 4 rows [00:00, 6.39 rows/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + " For further information visit https://errors.pydantic.dev/2.10/v/string_type\n", "Validation error occurred in row 0 file 348824dd8c0c74e6.json: 1 validation error for OpenImage\n", "image_id.Rotation\n", " Input should be a valid string 
[type=string_type, input_value=nan, input_type=float]\n", - " For further information visit https://errors.pydantic.dev/2.10/v/string_type\n", - "Validation error occurred in row 0 file 358315a151efa740.json: 1 validation error for OpenImage\n", - "image_id.Rotation\n", - " Input should be a valid string [type=string_type, input_value=0.0, input_type=float]\n", " For further information visit https://errors.pydantic.dev/2.10/v/string_type\n" ] }, @@ -824,18 +740,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processed: 6 rows [00:00, 6.71 rows/s]" + "Processed: 4 rows [00:00, 10.04 rows/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Validation error occurred in row 0 file 364d0be55f24616a.json: 1 validation error for OpenImage\n", + "Validation error occurred in row 0 file 358315a151efa740.json: 1 validation error for OpenImage\n", "image_id.Rotation\n", " Input should be a valid string [type=string_type, input_value=0.0, input_type=float]\n", " For further information visit https://errors.pydantic.dev/2.10/v/string_type\n", - "Validation error occurred in row 0 file 37b72b4e808bcd30.json: 1 validation error for OpenImage\n", + "Validation error occurred in row 0 file 364d0be55f24616a.json: 1 validation error for OpenImage\n", "image_id.Rotation\n", " Input should be a valid string [type=string_type, input_value=0.0, input_type=float]\n", " For further information visit https://errors.pydantic.dev/2.10/v/string_type\n" @@ -845,31 +761,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processed: 7 rows [00:01, 6.55 rows/s]" + "Processed: 7 rows [00:00, 8.55 rows/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Validation error occurred in row 0 file 384e33c0ffdfe052.json: 1 validation error for OpenImage\n", + "Validation error occurred in row 0 file 37b72b4e808bcd30.json: 1 validation error for OpenImage\n", "image_id.Rotation\n", " Input should be a valid string [type=string_type, input_value=0.0, 
input_type=float]\n", - " For further information visit https://errors.pydantic.dev/2.10/v/string_type\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processed: 8 rows [00:01, 5.36 rows/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Validation error occurred in row 0 file 39224d0d713cb866.json: 1 validation error for OpenImage\n", + " For further information visit https://errors.pydantic.dev/2.10/v/string_type\n", + "Validation error occurred in row 0 file 384e33c0ffdfe052.json: 1 validation error for OpenImage\n", "image_id.Rotation\n", " Input should be a valid string [type=string_type, input_value=0.0, input_type=float]\n", " For further information visit https://errors.pydantic.dev/2.10/v/string_type\n" @@ -879,18 +782,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processed: 10 rows [00:01, 5.17 rows/s]" + "Processed: 9 rows [00:01, 8.16 rows/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Validation error occurred in row 0 file 3b727441da9834b4.json: 1 validation error for OpenImage\n", + "Validation error occurred in row 0 file 39224d0d713cb866.json: 1 validation error for OpenImage\n", "image_id.Rotation\n", " Input should be a valid string [type=string_type, input_value=0.0, input_type=float]\n", " For further information visit https://errors.pydantic.dev/2.10/v/string_type\n", - "Validation error occurred in row 0 file 3cbec6265c443ea4.json: 1 validation error for OpenImage\n", + "Validation error occurred in row 0 file 3b727441da9834b4.json: 1 validation error for OpenImage\n", "image_id.Rotation\n", " Input should be a valid string [type=string_type, input_value=0.0, input_type=float]\n", " For further information visit https://errors.pydantic.dev/2.10/v/string_type\n" @@ -900,14 +803,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "Download: 25.6kB [00:01, 14.8kB/s]\n", - "Processed: 10 rows [00:01, 5.18 rows/s]\n" + "Download: 25.6kB [00:01, 
22.8kB/s]ows/s]\n", + "Processed: 10 rows [00:01, 7.80 rows/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ + "Validation error occurred in row 0 file 3cbec6265c443ea4.json: 1 validation error for OpenImage\n", + "image_id.Rotation\n", + " Input should be a valid string [type=string_type, input_value=0.0, input_type=float]\n", + " For further information visit https://errors.pydantic.dev/2.10/v/string_type\n", "Validation error occurred in row 0 file 3fa6819854b27685.json: 1 validation error for OpenImage\n", "image_id.Rotation\n", " Input should be a valid string [type=string_type, input_value=0.0, input_type=float]\n", @@ -918,16 +825,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "Cleanup: 1 tables [00:00, 3771.86 tables/s]\n" + "Cleanup: 1 tables [00:00, 1262.20 tables/s]\n" ] }, { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -950,24 +857,84 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "id": "cd2aa182-c57b-4dc8-9931-fe61988f9fd8", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "'{\\n \"id\": \"3122f16026310c3e\",\\n \"split\": \"test\",\\n \"image_id\": {\\n \"Subset\": \"test\",\\n \"OriginalURL\": \"https://c7.staticflickr.com/3/2275/2407197299_d4ea3fdab2_o.jpg\",\\n \"OriginalLandingURL\": \"https://www.flickr.com/photos/starshaped/2407197299\",\\n \"License\": \"https://creativecommons.org/licenses/by/2.0/\",\\n \"AuthorProfileURL\": \"https://www.flickr.com/people/starshaped/\",\\n \"Author\": \"Aubrey\",\\n \"Title\": \"My worst nightmare\",\\n \"OriginalSize\": 85370,\\n \"OriginalMD5\": \"O+50caXE4Ll4s3pmWcFa2w==\",\\n \"Thumbnail300KURL\": NaN,\\n \"Rotation\": NaN\\n },\\n \"classifications\": [\\n {\\n \"Source\": \"verification\",\\n \"LabelName\": \"/m/0k0pj\",\\n \"Confidence\": 0\\n },\\n {\\n \"Source\": \"verification\",\\n \"LabelName\": \"/m/03q69\",\\n 
\"Confidence\": 0\\n },\\n {\\n \"Source\": \"verification\",\\n \"LabelName\": \"/m/02dl1y\",\\n \"Confidence\": 0\\n },\\n {\\n \"Source\": \"verification\",\\n \"LabelName\": \"/m/014sv8\",\\n \"Confidence\": 0\\n },\\n {\\n \"Source\": \"verification\",\\n \"LabelName\": \"/m/0dzct\",\\n \"Confidence\": 0\\n },\\n {\\n \"Source\": \"verification\",\\n \"LabelName\": \"/m/04hgtk\",\\n \"Confidence\": 0\\n },\\n {\\n \"Source\": \"verification\",\\n \"LabelName\": \"/m/09j2d\",\\n \"Confidence\": 0\\n },\\n {\\n \"Source\": \"verification\",\\n \"LabelName\": \"/m/0283dt1\",\\n \"Confidence\": 0\\n },\\n {\\n \"Source\": \"verification\",\\n \"LabelName\": \"/m/0463sg\",\\n \"Confidence\": 0\\n }\\n ]\\n}'" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"3122f16026310c3e\",\n", + " \"split\": \"test\",\n", + " \"image_id\": {\n", + " \"Subset\": \"test\",\n", + " \"OriginalURL\": \"https://c7.staticflickr.com/3/2275/2407197299_d4ea3fdab2_o.jpg\",\n", + " \"OriginalLandingURL\": \"https://www.flickr.com/photos/starshaped/2407197299\",\n", + " \"License\": \"https://creativecommons.org/licenses/by/2.0/\",\n", + " \"AuthorProfileURL\": \"https://www.flickr.com/people/starshaped/\",\n", + " \"Author\": \"Aubrey\",\n", + " \"Title\": \"My worst nightmare\",\n", + " \"OriginalSize\": 85370,\n", + " \"OriginalMD5\": \"O+50caXE4Ll4s3pmWcFa2w==\",\n", + " \"Thumbnail300KURL\": NaN,\n", + " \"Rotation\": NaN\n", + " },\n", + " \"classifications\": [\n", + " {\n", + " \"Source\": \"verification\",\n", + " \"LabelName\": \"/m/0k0pj\",\n", + " \"Confidence\": 0\n", + " },\n", + " {\n", + " \"Source\": \"verification\",\n", + " \"LabelName\": \"/m/03q69\",\n", + " \"Confidence\": 0\n", + " },\n", + " {\n", + " \"Source\": \"verification\",\n", + " \"LabelName\": \"/m/02dl1y\",\n", + " \"Confidence\": 0\n", + " },\n", + " {\n", + " \"Source\": 
\"verification\",\n", + " \"LabelName\": \"/m/014sv8\",\n", + " \"Confidence\": 0\n", + " },\n", + " {\n", + " \"Source\": \"verification\",\n", + " \"LabelName\": \"/m/0dzct\",\n", + " \"Confidence\": 0\n", + " },\n", + " {\n", + " \"Source\": \"verification\",\n", + " \"LabelName\": \"/m/04hgtk\",\n", + " \"Confidence\": 0\n", + " },\n", + " {\n", + " \"Source\": \"verification\",\n", + " \"LabelName\": \"/m/09j2d\",\n", + " \"Confidence\": 0\n", + " },\n", + " {\n", + " \"Source\": \"verification\",\n", + " \"LabelName\": \"/m/0283dt1\",\n", + " \"Confidence\": 0\n", + " },\n", + " {\n", + " \"Source\": \"verification\",\n", + " \"LabelName\": \"/m/0463sg\",\n", + " \"Confidence\": 0\n", + " }\n", + " ]\n", + "}\n" + ] } ], "source": [ "uri = \"gs://datachain-demo/openimages-v6-test-jsonpairs/3122f16026310c3e.json\"\n", - "next(DataChain.from_storage(uri, type='text', anon=True).collect(\"file\")).read()" + "print(next(DataChain.from_storage(uri, type='text', anon=True).collect(\"file\")).read())" ] }, { @@ -985,7 +952,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "da0b968f-ec9b-4920-a617-ba28abfa6e8b", "metadata": {}, "outputs": [ @@ -993,15 +960,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "Preparing: 1 rows [00:00, 1043.88 rows/s]\n", - "Download: 3.69MB [00:00, 108MB/s]\n", - "Processed: 1 rows [00:00, 951.31 rows/s]\n", - "Cleanup: 2 tables [00:00, 6689.48 tables/s]\n", "Processed: 0 rows [00:00, ? 
rows/s]\n", - "Download: 3.69MB [00:00, 125MB/s]\n", - "Processed: 1 rows [00:00, 1.89 rows/s]\n", - "Generated: 2 rows [00:00, 1103.04 rows/s]\n", - "Cleanup: 1 tables [00:00, 4568.96 tables/s]\n" + "Download: 3.69MB [00:00, 136MB/s]\n", + "Processed: 1 rows [00:00, 3.06 rows/s]\n", + "Generated: 2 rows [00:00, 1195.47 rows/s]\n", + "Cleanup: 1 tables [00:00, 766.22 tables/s]\n" ] }, { @@ -1120,7 +1083,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 40, "id": "c090e278-c82b-43c8-a0ff-26ff0f764da9", "metadata": {}, "outputs": [ @@ -1128,15 +1091,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "Preparing: 1 rows [00:00, 844.60 rows/s]\n", - "Download: 102B [00:00, 7.56kB/s]\n", - "Processed: 1 rows [00:00, 756.96 rows/s]\n", - "Cleanup: 2 tables [00:00, 4301.85 tables/s]\n", "Processed: 0 rows [00:00, ? rows/s]\n", - "Download: 102B [00:00, 52.6kB/s]\n", - "Processed: 1 rows [00:00, 7.18 rows/s]\n", - "Generated: 1 rows [00:00, 684.45 rows/s]\n", - "Cleanup: 1 tables [00:00, 3557.51 tables/s]\n" + "Download: 102B [00:00, 38.6kB/s]\n", + "Processed: 1 rows [00:00, 8.28 rows/s]\n", + "Generated: 1 rows [00:00, 522.98 rows/s]\n", + "Cleanup: 1 tables [00:00, 3075.00 tables/s]\n" ] }, { @@ -1242,7 +1201,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "b40c1a34-d38d-43eb-a5ba-bf129be2a9b7", "metadata": {}, "outputs": [ @@ -1250,11 +1209,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processed: 1 rows [00:00, 494.03 rows/s]\n", - "Generated: 5 rows [00:00, 4822.15 rows/s]\n", - "Processed: 1 rows [00:00, 739.74 rows/s]\n", - "Generated: 5 rows [00:00, 6121.28 rows/s]\n", - "Cleanup: 2 tables [00:00, 5062.53 tables/s]\n" + "Processed: 1 rows [00:00, 172.02 rows/s]\n", + "Generated: 5 rows [00:00, 1810.23 rows/s]\n", + "Processed: 1 rows [00:00, 52.27 rows/s]\n", + "Generated: 5 rows [00:00, 334.31 rows/s]\n", + "Cleanup: 2 tables [00:00, 489.33 tables/s]\n" ] }, { @@ -1353,7 
+1312,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 37, "id": "24171361-1023-4728-bb4c-898db8399416", "metadata": {}, "outputs": [ @@ -1361,11 +1320,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processed: 1 rows [00:00, 411.53 rows/s]\n", - "Generated: 5 rows [00:00, 3161.22 rows/s]\n", - "Processed: 1 rows [00:00, 333.09 rows/s]\n", - "Generated: 5 rows [00:00, 2876.75 rows/s]\n", - "Cleanup: 4 tables [00:00, 6665.56 tables/s]\n" + "Processed: 1 rows [00:00, 361.55 rows/s]\n", + "Generated: 5 rows [00:00, 2799.56 rows/s]\n", + "Processed: 1 rows [00:00, 439.75 rows/s]\n", + "Generated: 5 rows [00:00, 3362.44 rows/s]\n", + "Cleanup: 6 tables [00:00, 6894.75 tables/s]\n" ] }, { @@ -1473,7 +1432,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "03a5299c-b2b3-4a96-a90c-467114a9d9e6", "metadata": {}, "outputs": [], @@ -1498,7 +1457,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "63c3b599-8583-42ef-8f71-86fbfb88efda", "metadata": { "scrolled": true @@ -1508,10 +1467,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "Listing gs://datachain-demo: 5000 objects [00:00, 7055.82 objects/s]\n", - "Processed: 1 rows [00:00, 1.19 rows/s]\n", - "Generated: 5000 rows [00:00, 39021.94 rows/s]\n", - "Cleanup: 1 tables [00:00, 2557.50 tables/s]" + "Listing gs://datachain-demo: 5000 objects [00:00, 8579.58 objects/s]\n", + "Processed: 1 rows [00:00, 1.47 rows/s]\n", + "Generated: 5000 rows [00:00, 51172.27 rows/s]\n", + "Cleanup: 1 tables [00:00, 1831.57 tables/s]" ] }, { @@ -1552,22 +1511,12 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "d9c7d246-cb9e-4811-bb7a-8211fca499a8", "metadata": { "scrolled": true }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Preparing: 1 rows [00:00, 1126.59 rows/s]\n", - "Download: 3.69MB [00:00, 126MB/s]\n", - "Processed: 1 rows [00:00, 1054.91 
rows/s]\n", - "Cleanup: 2 tables [00:00, 2993.79 tables/s]" - ] - }, { "name": "stdout", "output_type": "stream", @@ -1582,13 +1531,6 @@ " flickr_url: str\n", " id: int\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] } ], "source": [ @@ -1607,7 +1549,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "8f548f45-477d-4052-9a42-1db1205ef65e", "metadata": {}, "outputs": [ @@ -1651,7 +1593,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "c190b270-5f0c-431e-835b-e685e40cb700", "metadata": {}, "outputs": [ @@ -1659,11 +1601,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processed: 0 rows [00:00, ? rows/s]\n", - "Download: 3.69MB [00:00, 23.6MB/s]\n", - "Processed: 1 rows [00:00, 1.44 rows/s]\n", - "Generated: 5000 rows [00:00, 35636.40 rows/s]\n", - "Cleanup: 1 tables [00:00, 1305.42 tables/s]\n" + "Processed: 0 rows [00:00, ? rows/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Download: 3.69MB [00:00, 30.5MB/s]\n", + "Processed: 1 rows [00:00, 2.23 rows/s]\n", + "Generated: 5000 rows [00:00, 50565.22 rows/s]\n", + "Cleanup: 1 tables [00:00, 1577.40 tables/s]\n" ] }, { @@ -1672,7 +1621,7 @@ "5000" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1693,20 +1642,10 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "d4bbdb3b-39b9-42f5-99e2-311b168d5782", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Preparing: 1 rows [00:00, 598.08 rows/s]\n", - "Download: 3.69MB [00:00, 117MB/s]\n", - "Processed: 1 rows [00:00, 974.51 rows/s]\n", - "Cleanup: 2 tables [00:00, 4946.11 tables/s]" - ] - }, { "name": "stdout", "output_type": "stream", @@ -1734,13 +1673,6 @@ " id: int\n", " caption: str\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] } ], 
"source": [ @@ -1772,24 +1704,10 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "id": "41b38187", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Preparing: 1 rows [00:00, 651.09 rows/s]\n", - "Download: 19.1MB [00:01, 18.0MB/s]\n", - "Processed: 1 rows [00:00, 972.71 rows/s]\n", - "Cleanup: 2 tables [00:00, 3731.59 tables/s]\n", - "Preparing: 1 rows [00:00, 907.47 rows/s]\n", - "Download: 19.1MB [00:01, 17.5MB/s]\n", - "Processed: 1 rows [00:00, 1029.02 rows/s]\n", - "Cleanup: 2 tables [00:00, 7667.83 tables/s]" - ] - }, { "name": "stdout", "output_type": "stream", @@ -1818,34 +1736,25 @@ " id: int\n", " \n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] } ], "source": [ "import sys\n", "from difflib import unified_diff\n", "\n", - "\n", "# Get JSON schema for annotations[0]\n", - "before = (next(\n", - " DataChain\n", - " .from_storage(detections_uri)\n", - " .print_json_schema(model_name=\"Narrative\", jmespath=\"annotations[0]\")\n", - " .collect(\"meta_schema\")\n", - "))\n", + "chain = DataChain.from_storage(detections_uri, type=\"text\", anon=True)\n", + "before = gen_datamodel_code(\n", + " next(chain.collect(\"file\")),\n", + " model_name=\"Narrative\",\n", + " jmespath=\"annotations[0]\"\n", + ")\n", "\n", "# Get JSON schema for annotations[36336]\n", - "after = next(\n", - " DataChain\n", - " .from_storage(detections_uri)\n", - " .print_json_schema(model_name=\"Narrative\", jmespath=\"annotations[36336]\")\n", - " .collect(\"meta_schema\")\n", + "after = gen_datamodel_code(\n", + " next(chain.collect(\"file\")),\n", + " model_name=\"Narrative\",\n", + " jmespath=\"annotations[36336]\"\n", ")\n", "\n", "sys.stdout.writelines(unified_diff(before.splitlines(keepends=True)[5:], after.splitlines(keepends=True)[5:]))" @@ -1863,7 +1772,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 26, "id": 
"46c51457-8137-440e-93b9-0e858e9f257e", "metadata": {}, "outputs": [ @@ -1871,20 +1780,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "Preparing: 1 rows [00:00, 671.20 rows/s]\n", - "Download: 19.1MB [00:01, 16.2MB/s]\n", - "Processed: 1 rows [00:00, 1100.29 rows/s]\n", - "Cleanup: 2 tables [00:00, 5332.87 tables/s]\n", "Processed: 0 rows [00:00, ? rows/s]\n", - "Download: 3.69MB [00:00, 23.9MB/s]\n", - "Processed: 1 rows [00:00, 2.07 rows/s]\n", - "Generated: 5000 rows [00:00, 35158.09 rows/s]\n", + "Download: 3.69MB [00:00, 28.9MB/s]\n", + "Processed: 1 rows [00:00, 2.19 rows/s]\n", + "Generated: 5000 rows [00:00, 49466.73 rows/s]\n", "Processed: 0 rows [00:00, ? rows/s]\n", "\u001b[A\n", "\u001b[A\n", - "Download: 3.69MB [00:00, 7.74MB/s]\n", - "Processed: 1 rows [00:00, 1.19 rows/s]\n", - "Generated: 25014 rows [00:00, 52029.02 rows/s]\n", + "Download: 3.69MB [00:00, 12.8MB/s]\n", + "Processed: 1 rows [00:00, 1.59 rows/s]\n", + "Generated: 25014 rows [00:00, 91687.79 rows/s]\n", "Processed: 0 rows [00:00, ? 
rows/s]\n", "\u001b[A\n", "\u001b[A\n", @@ -1900,10 +1805,10 @@ "\u001b[A\n", "\u001b[A\n", "\u001b[A\n", - "Download: 19.1MB [00:03, 6.33MB/s]\n", - "Processed: 1 rows [00:03, 3.55s/ rows]\n", - "Generated: 36335 rows [00:02, 17672.36 rows/s]\n", - "Cleanup: 21 tables [00:00, 2035.69 tables/s]\n" + "Download: 19.1MB [00:03, 6.48MB/s]\n", + "Processed: 1 rows [00:03, 3.44s/ rows]\n", + "Generated: 36335 rows [00:01, 19031.37 rows/s]\n", + "Cleanup: 6 tables [00:00, 673.64 tables/s]\n" ] } ], @@ -1914,7 +1819,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 27, "id": "34b9eba3-f671-4c2f-aa0f-628f3f38897f", "metadata": {}, "outputs": [ @@ -1924,7 +1829,7 @@ "36403" ] }, - "execution_count": 25, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1935,7 +1840,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 28, "id": "753b62c8-f9a7-4d9c-8842-2365aa2dd69c", "metadata": {}, "outputs": [ @@ -1991,7 +1896,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 29, "id": "4d2dd628-e37c-459c-a4b2-0243b7647005", "metadata": {}, "outputs": [ @@ -1999,15 +1904,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "Preparing: 1 rows [00:00, 1219.63 rows/s]\n", - "Download: 19.1MB [00:01, 16.4MB/s]\n", - "Processed: 1 rows [00:00, 1027.76 rows/s]\n", - "Cleanup: 2 tables [00:00, 5447.15 tables/s]\n", "Processed: 0 rows [00:00, ? 
rows/s]\n", - "Download: 19.1MB [00:01, 16.9MB/s]\n", - "Processed: 1 rows [00:01, 1.57s/ rows]\n", - "Generated: 80 rows [00:00, 25651.27 rows/s]\n", - "Cleanup: 1 tables [00:00, 5777.28 tables/s]\n" + "Download: 19.1MB [00:01, 16.3MB/s]\n", + "Processed: 1 rows [00:01, 1.61s/ rows]\n", + "Generated: 80 rows [00:00, 32338.50 rows/s]\n", + "Cleanup: 1 tables [00:00, 5236.33 tables/s]\n" ] } ], @@ -2032,7 +1933,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 30, "id": "16d992b7-2c08-43cc-ac8b-156a9d31ab64", "metadata": {}, "outputs": [], @@ -2049,7 +1950,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 31, "id": "1c56e573-d4d4-44af-8f6f-475320687e4d", "metadata": { "scrolled": true @@ -2063,7 +1964,7 @@ "" ] }, - "execution_count": 29, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -2083,7 +1984,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 32, "id": "314c5b07-567b-4a99-98b9-80a735bb4876", "metadata": {}, "outputs": [ @@ -2095,7 +1996,7 @@ "" ] }, - "execution_count": 30, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -2110,7 +2011,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 34, "id": "60e8b6f9-2d10-4365-bb90-1f7735211c2c", "metadata": {}, "outputs": [ @@ -2120,7 +2021,7 @@ "'A lot of people that are looking at a pool.'" ] }, - "execution_count": 30, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -2139,7 +2040,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 33, "id": "15f316eb-2203-4bbb-8ab3-61feab9b1f2f", "metadata": {}, "outputs": [ @@ -2151,7 +2052,7 @@ "" ] }, - "execution_count": 31, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -2179,7 +2080,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 35, "id": "f083d64a-d1f4-467a-9afb-e5036f583528", "metadata": {}, 
"outputs": [ @@ -2187,9 +2088,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processed: 646 rows [00:00, 15217.24 rows/s]\n", - "Generated: 17 rows [00:00, 437.09 rows/s]\n", - "Cleanup: 2 tables [00:00, 5223.29 tables/s]\n" + "Processed: 646 rows [00:00, 10205.35 rows/s]\n", + "Generated: 17 rows [00:00, 284.39 rows/s]\n", + "Cleanup: 2 tables [00:00, 5090.17 tables/s]\n" ] }, { @@ -2198,7 +2099,7 @@ "17" ] }, - "execution_count": 32, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -2230,7 +2131,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 36, "id": "6d4b5156-303e-4a38-8acb-197f54a7391d", "metadata": {}, "outputs": [ @@ -2238,8 +2139,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "Processed: 646 rows [00:00, 16034.94 rows/s]\n", - "Generated: 17 rows [00:00, 465.63 rows/s]\n" + "Processed: 646 rows [00:00, 16416.76 rows/s]\n", + "Generated: 17 rows [00:00, 481.05 rows/s]\n" ] }, { @@ -2250,7 +2151,7 @@ "" ] }, - "execution_count": 33, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" }