Skip to content

Commit

Permalink
INTPYTHON-418 Fix handling of missing document fields (#254)
Browse files (browse the repository at this point in the history)
  • Loading branch information
blink1073 authored Dec 18, 2024
1 parent 7c4cf76 commit 37ac8ee
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 3 deletions.
12 changes: 10 additions & 2 deletions bindings/python/pymongoarrow/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(self, schema, codec_options=None):
from pymongoarrow.lib import BuilderManager

self.manager = BuilderManager(schema_map, self.schema is not None, self.tzinfo)
self.schema_map = schema_map

def process_bson_stream(self, stream):
self.manager.process_bson_stream(stream, len(stream))
Expand All @@ -59,8 +60,15 @@ def _parse_builder_map(builder_map):
# Traverse the builder map right to left.
for key, value in reversed(builder_map.items()):
if value.type_marker == _BsonArrowTypes.document.value:
names = value.finish()
full_names = [f"{key}.{name}" for name in names]
names = []
full_names = []
for candidate in list(builder_map):
if candidate.startswith(key + "."):
name = candidate[len(key) + 1 :]
if "." in name or "[" in name:
continue
names.append(name)
full_names.append(candidate)
arrs = [builder_map[c] for c in full_names]
builder_map[key] = StructArray.from_arrays(arrs, names=names)
to_remove.extend(full_names)
Expand Down
2 changes: 1 addition & 1 deletion bindings/python/pymongoarrow/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ cdef class BuilderManager:
# We only use the doc_iter for binary arrays, which are handled already.
self.get_builder(name, ftype, <bson_iter_t *>nullptr)

cdef _ArrayBuilderBase get_builder(self, cstring key, bson_type_t value_t, bson_iter_t * doc_iter) except *:
cdef _ArrayBuilderBase get_builder(self, cstring key, bson_type_t value_t, bson_iter_t * doc_iter):
cdef _ArrayBuilderBase builder = None
cdef bson_subtype_t subtype
cdef const uint8_t *val_buf = NULL
Expand Down
26 changes: 26 additions & 0 deletions bindings/python/test/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,32 @@ def test_string_bool(self):
),
)

def test_schema_missing_field(self):
    """Check that a schema field absent from stored documents reads back as None.

    One document is inserted whose ``list_field`` entries only carry
    ``name`` and ``test``.  The schema additionally declares ``test_test``
    for those entries, and both read functions are expected to return
    ``None`` for it.
    """
    # Reset the collection so it holds exactly the one document below.
    self.coll.drop()
    stored_doc = {
        "_id": ObjectId("000000000000000000000013"),
        "list_field": [{"name": "Test1", "test": "Test2"}],
    }
    self.coll.insert_one(stored_doc)

    item_fields = {
        "name": pa.string(),
        "test": pa.string(),
        # Declared in the schema only; does not exist in the database collection.
        "test_test": pa.string(),
    }
    schema = Schema({"_id": ObjectId, "list_field": [item_fields]})

    expected = [[{"name": "Test1", "test": "Test2", "test_test": None}]]

    # find_arrow_all is called with {} and aggregate_arrow_all with [] as the
    # second positional argument.
    calls = ((find_arrow_all, {}), (aggregate_arrow_all, []))
    for reader, query in calls:
        table = reader(self.coll, query, schema=schema)
        table = table.drop(["_id"])
        self.assertEqual(table["list_field"].to_pylist(), expected)

def test_auto_schema_nested(self):
# Create table with random data of various types.
_, data = self._create_nested_data()
Expand Down

0 comments on commit 37ac8ee

Please sign in to comment.