Skip to content

Commit

Permalink
feat(docx): add strategy parameter to partition_docx() (Unstructured-…
Browse files Browse the repository at this point in the history
…IO#3026)

**Summary**
The behavior of an image sub-partitioner can be partially determined by
the partitioning strategy, for example whether it is "hi_res" or "fast".
Add this parameter to `partition_docx()` so it can pass it along to
`DocxPartitionerOptions` which will make it available to any image
sub-partitioners.
  • Loading branch information
scanny authored May 15, 2024
1 parent a164b01 commit 094e354
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 6 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
## 0.13.8-dev10
## 0.13.8-dev11

### Enhancements

* **Faster evaluation** Support for concurrent processing of documents during evaluation
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.

### Features

Expand Down
20 changes: 19 additions & 1 deletion test_unstructured/partition/docx/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@
Title,
)
from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
from unstructured.partition.utils.constants import (
UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
PartitionStrategy,
)

# -- docx-file loading behaviors -----------------------------------------------------------------

Expand Down Expand Up @@ -701,6 +704,7 @@ def opts_args() -> dict[str, Any]:
"infer_table_structure": True,
"metadata_file_path": None,
"metadata_last_modified": None,
"strategy": None,
}


Expand Down Expand Up @@ -905,6 +909,20 @@ def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
list(opts.increment_page_number())
assert opts.page_number == 4

# -- .strategy -------------------------------

@pytest.mark.parametrize(
    ("arg_value", "expected_value"),
    [
        (None, "hi_res"),
        (PartitionStrategy.FAST, "fast"),
        (PartitionStrategy.HI_RES, "hi_res"),
    ],
)
def it_knows_which_partitioning_strategy_to_use(
    self, opts_args: dict[str, Any], arg_value: str, expected_value: str
):
    """`.strategy` reports the strategy given at construction, or "hi_res" when it is None."""
    # -- override only the strategy; every other option keeps its fixture value --
    opts_args.update({"strategy": arg_value})

    options = DocxPartitionerOptions(**opts_args)

    assert options.strategy == expected_value

# -- ._document_contains_pagebreaks ----------

@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.8-dev10" # pragma: no cover
__version__ = "0.13.8-dev11" # pragma: no cover
21 changes: 18 additions & 3 deletions unstructured/partition/docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
is_possible_title,
is_us_city_state_zip,
)
from unstructured.partition.utils.constants import PartitionStrategy
from unstructured.utils import (
dependency_exists,
is_temp_file_path,
Expand Down Expand Up @@ -170,15 +171,17 @@ def extract_docx_filename(file_path: str) -> str:
@add_chunking_strategy
def partition_docx(
filename: Optional[str] = None,
*,
date_from_file_object: bool = False,
detect_language_per_element: bool = False,
file: Optional[IO[bytes]] = None,
include_page_breaks: bool = True,
infer_table_structure: bool = True,
languages: Optional[list[str]] = ["auto"],
metadata_filename: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
languages: Optional[list[str]] = ["auto"],
detect_language_per_element: bool = False,
date_from_file_object: bool = False,
starting_page_number: int = 1,
strategy: Optional[str] = None,
**kwargs: Any,
) -> list[Element]:
"""Partitions Microsoft Word Documents in .docx format into its document elements.
Expand Down Expand Up @@ -226,6 +229,7 @@ def partition_docx(
metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
strategy=strategy,
)

elements = _DocxPartitioner.iter_document_elements(opts)
Expand All @@ -252,6 +256,7 @@ def __init__(
metadata_file_path: Optional[str],
metadata_last_modified: Optional[str],
starting_page_number: int = 1,
strategy: str | None = None,
):
self._date_from_file_object = date_from_file_object
self._file = file
Expand All @@ -260,6 +265,7 @@ def __init__(
self._infer_table_structure = infer_table_structure
self._metadata_file_path = metadata_file_path
self._metadata_last_modified = metadata_last_modified
self._strategy = strategy
# -- options object maintains page-number state --
self._page_counter = starting_page_number

Expand Down Expand Up @@ -345,6 +351,15 @@ def page_number(self) -> int:
"""
return self._page_counter

@lazyproperty
def strategy(self) -> str:
    """Partitioning strategy requested for this document.

    Returns the `strategy` value supplied to the constructor, falling back to
    `PartitionStrategy.HI_RES` when none was given (i.e. it was `None`). The named strategies
    are class attributes of `unstructured.partition.utils.constants.PartitionStrategy`.
    """
    requested = self._strategy
    if requested is None:
        # -- no explicit request, so use the high-resolution strategy --
        return PartitionStrategy.HI_RES
    return requested

@lazyproperty
def _document_contains_pagebreaks(self) -> bool:
"""True when there is at least one page-break detected in the document.
Expand Down

0 comments on commit 094e354

Please sign in to comment.