Skip to content

Commit

Permalink
cleanup a few from_json methods from DC (#727)
Browse files Browse the repository at this point in the history
  • Loading branch information
shcheklein authored Dec 22, 2024
1 parent 20c73b2 commit aed6d96
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 198 deletions.
59 changes: 20 additions & 39 deletions examples/get_started/json-csv-reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from datachain import C, DataChain
from datachain.lib.data_model import ModelStore
from datachain.lib.meta_formats import gen_datamodel_code


# Sample model for static JSON model
Expand All @@ -28,71 +29,51 @@ class ChatDialog(BaseModel):


def main():
print()
print("========================================================================")
print("Dynamic JSONl schema from 2 objects")
print("========================================================================")
# Dynamic JSONl schema from 2 objects
uri = "gs://datachain-demo/jsonl/object.jsonl"
jsonl_ds = DataChain.from_json(uri, meta_type="jsonl", print_schema=True)
jsonl_ds = DataChain.from_json(uri, format="jsonl", anon="True")
jsonl_ds.show()

print()
print("========================================================================")
print("Dynamic JSON schema from 200 OpenImage json-pairs with validation errors")
print("========================================================================")
# Dynamic JSON schema from 200 OpenImage json-pairs with validation errors
uri = "gs://datachain-demo/openimages-v6-test-jsonpairs/*json"
schema_uri = (
"gs://datachain-demo/openimages-v6-test-jsonpairs/08392c290ecc9d2a.json"
)
json_pairs_ds = DataChain.from_json(
uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage"
uri, schema_from=schema_uri, jmespath="@", model_name="OpenImage", anon="True"
)
json_pairs_ds.show()

uri = "gs://datachain-demo/coco2017/annotations_captions/"

print()
print("========================================================================")
print("Reading JSON schema from main COCO annotation")
print("========================================================================")
chain = (
DataChain.from_storage(uri)
.filter(C("file.path").glob("*.json"))
.print_json_schema(jmespath="@", model_name="Coco")
# Print JSON schema in Pydantic format from main COCO annotation
chain = DataChain.from_storage(uri, anon="True").filter(
C("file.path").glob("*.json")
)
chain.save()
file = next(chain.limit(1).collect("file"))
print(gen_datamodel_code(file, jmespath="@", model_name="Coco"))

print()
print("========================================================================")
print("static JSON schema test parsing 3/7 objects")
print("========================================================================")
# Static JSON schema test parsing 3/7 objects
static_json_ds = DataChain.from_json(
uri, jmespath="licenses", spec=LicenseFeature, nrows=3
uri, jmespath="licenses", spec=LicenseFeature, nrows=3, anon="True"
)
static_json_ds.show()

print()
print("========================================================================")
print("dynamic JSON schema test parsing 5K objects")
print("========================================================================")
dynamic_json_ds = DataChain.from_json(uri, jmespath="images", print_schema=True)
# Dynamic JSON schema test parsing 5K objects
dynamic_json_ds = DataChain.from_json(uri, jmespath="images", anon="True")
print(dynamic_json_ds.to_pandas())

# Static CSV with header schema test parsing 3.5K objects
uri = "gs://datachain-demo/chatbot-csv/"
print()
print("========================================================================")
print("static CSV with header schema test parsing 3.5K objects")
print("========================================================================")
static_csv_ds = DataChain.from_csv(uri, output=ChatDialog, object_name="chat")
static_csv_ds = DataChain.from_csv(
uri, output=ChatDialog, object_name="chat", anon="True"
)
static_csv_ds.print_schema()
static_csv_ds.show()

# Dynamic CSV with header schema test parsing 3/3M objects
uri = "gs://datachain-demo/laion-aesthetics-csv/laion_aesthetics_1024_33M_1.csv"
print()
print("========================================================================")
print("dynamic CSV with header schema test parsing 3/3M objects")
print("========================================================================")
dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3)
dynamic_csv_ds = DataChain.from_csv(uri, object_name="laion", nrows=3, anon="True")
dynamic_csv_ds.print_schema()
dynamic_csv_ds.show()

Expand Down
115 changes: 6 additions & 109 deletions src/datachain/lib/dc.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
parse_listing_uri,
)
from datachain.lib.listing_info import ListingInfo
from datachain.lib.meta_formats import read_meta, read_schema
from datachain.lib.meta_formats import read_meta
from datachain.lib.model_store import ModelStore
from datachain.lib.settings import Settings
from datachain.lib.signal_schema import SignalSchema
Expand Down Expand Up @@ -554,8 +554,7 @@ def from_json(
jmespath: Optional[str] = None,
object_name: Optional[str] = "",
model_name: Optional[str] = None,
print_schema: Optional[bool] = False,
meta_type: Optional[str] = "json",
format: Optional[str] = "json",
nrows=None,
**kwargs,
) -> "DataChain":
Expand All @@ -564,12 +563,12 @@ def from_json(
Parameters:
path : storage URI with directory. URI must start with storage prefix such
as `s3://`, `gs://`, `az://` or "file:///"
type : read file as "binary", "text", or "image" data. Default is "binary".
type : read file as "binary", "text", or "image" data. Default is "text".
spec : optional Data Model
schema_from : path to sample to infer spec (if schema not provided)
object_name : generated object column name
model_name : optional generated model name
print_schema : print auto-generated schema
format: "json", "jsonl"
jmespath : optional JMESPATH expression to reduce JSON
nrows : optional row limit for jsonl and JSON arrays
Expand All @@ -594,75 +593,14 @@ def jmespath_to_name(s: str):
if (not object_name) and jmespath:
object_name = jmespath_to_name(jmespath)
if not object_name:
object_name = meta_type
object_name = format
chain = DataChain.from_storage(uri=path, type=type, **kwargs)
signal_dict = {
object_name: read_meta(
schema_from=schema_from,
meta_type=meta_type,
format=format,
spec=spec,
model_name=model_name,
print_schema=print_schema,
jmespath=jmespath,
nrows=nrows,
)
}
return chain.gen(**signal_dict) # type: ignore[misc, arg-type]

@classmethod
def from_jsonl(
cls,
path,
type: Literal["binary", "text", "image"] = "text",
spec: Optional[DataType] = None,
schema_from: Optional[str] = "auto",
jmespath: Optional[str] = None,
object_name: Optional[str] = "",
model_name: Optional[str] = None,
print_schema: Optional[bool] = False,
meta_type: Optional[str] = "jsonl",
nrows=None,
**kwargs,
) -> "DataChain":
"""Get data from JSON lines. It returns the chain itself.
Parameters:
path : storage URI with directory. URI must start with storage prefix such
as `s3://`, `gs://`, `az://` or "file:///"
type : read file as "binary", "text", or "image" data. Default is "text".
spec : optional Data Model
schema_from : path to sample to infer spec (if schema not provided)
object_name : generated object column name
model_name : optional generated model name
print_schema : print auto-generated schema
jmespath : optional JMESPATH expression to reduce JSON
nrows : optional row limit for jsonl and JSON arrays
Example:
infer JSONl schema from data, limit parsing to 1 row
```py
chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
```
"""
if schema_from == "auto":
schema_from = path

def jmespath_to_name(s: str):
name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s) # type: ignore[union-attr]
return s[:name_end]

if (not object_name) and jmespath:
object_name = jmespath_to_name(jmespath)
if not object_name:
object_name = meta_type
chain = DataChain.from_storage(uri=path, type=type, **kwargs)
signal_dict = {
object_name: read_meta(
schema_from=schema_from,
meta_type=meta_type,
spec=spec,
model_name=model_name,
print_schema=print_schema,
jmespath=jmespath,
nrows=nrows,
)
Expand Down Expand Up @@ -793,47 +731,6 @@ def listings(
**{object_name: catalog.listings()}, # type: ignore[arg-type]
)

def print_json_schema( # type: ignore[override]
self, jmespath: Optional[str] = None, model_name: Optional[str] = None
) -> "Self":
"""Generate the JSON data model for the chain's files and store it.
Maps `read_schema` (called with `data_type="json"`) over the chain and
stores its string output in the `meta_schema` column. Returns the chain
produced by `map`.
NOTE(review): any printing happens inside `read_schema`, which is not
visible here - confirm before relying on console output.
Parameters:
jmespath : JMESPATH expression to reduce JSON (forwarded as `expr`)
model_name : generated model name
Example:
print JSON schema and save to column "meta_schema":
```py
uri = "gs://datachain-demo/coco2017/annotations_captions/"
chain = DataChain.from_storage(uri)
chain = chain.print_json_schema()
chain.save()
```
"""
return self.map(
meta_schema=lambda file: read_schema(
file, data_type="json", expr=jmespath, model_name=model_name
),
output=str,
)

def print_jsonl_schema( # type: ignore[override]
self, jmespath: Optional[str] = None, model_name: Optional[str] = None
) -> "Self":
"""Generate the JSON lines data model for the chain's files and store it.
Maps `read_schema` (called with `data_type="jsonl"`) over the chain and
stores its string output in the `meta_schema` column. Returns the chain
produced by `map`.
NOTE(review): any printing happens inside `read_schema`, which is not
visible here - confirm before relying on console output.
Parameters:
jmespath : JMESPATH expression to reduce JSON (forwarded as `expr`)
model_name : generated model name
"""
return self.map(
meta_schema=lambda file: read_schema(
file, data_type="jsonl", expr=jmespath, model_name=model_name
),
output=str,
)

def save( # type: ignore[override]
self, name: Optional[str] = None, version: Optional[int] = None, **kwargs
) -> "Self":
Expand Down
Loading

0 comments on commit aed6d96

Please sign in to comment.