Skip to content

Commit

Permalink
Changes introduce new mode parameter (#103)
Browse files Browse the repository at this point in the history
* Changes introduce new mode parameter

* Updated the SDK version

* Updated the SDK version

* Update src/unstract/sdk/__init__.py

Co-authored-by: Chandrasekharan M <[email protected]>
Signed-off-by: Rahul Johny <[email protected]>

* Updated the SDK version

* Updated the comments

---------

Signed-off-by: Rahul Johny <[email protected]>
Co-authored-by: Chandrasekharan M <[email protected]>
  • Loading branch information
johnyrahul and chandrasekharan-zipstack authored Sep 23, 2024
1 parent 84f4985 commit 2a0d17b
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ class ProcessingModes(Enum):
TEXT = "text"


# Values accepted by the "mode" adapter setting. The per-member notes below
# paraphrase the "mode" description in the adapter's JSON schema.
class Modes(Enum):
# Extracts text from PDFs without OCR; intended for native-text PDFs.
NATIVE_TEXT = "native_text"
# OCR mode; per the schema text it cannot handle handwriting or
# low-quality scans.
LOW_COST = "low_cost"
# OCR mode; per the schema text it can handle handwriting and
# low-quality scans.
HIGH_QUALITY = "high_quality"
# OCR mode described like high_quality, and can also extract information
# about checkboxes and radio buttons.
FORM = "form"


class OutputModes(Enum):
LINE_PRINTER = "line-printer"
DUMP_TEXT = "dump-text"
Expand Down Expand Up @@ -52,6 +59,7 @@ class WhispererConfig:

URL = "url"
PROCESSING_MODE = "processing_mode"
MODE = "mode"
OUTPUT_MODE = "output_mode"
UNSTRACT_KEY = "unstract_key"
MEDIAN_FILTER_SIZE = "median_filter_size"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,10 @@ def _get_whisper_params(self, enable_highlight: bool = False) -> dict[str, Any]:
WhispererConfig.PROCESSING_MODE: self.config.get(
WhispererConfig.PROCESSING_MODE, ProcessingModes.TEXT.value
),
# No default value is provided here, to maintain legacy compatibility.
# Providing a default value would override the params
# processing_mode, force_text_processing
WhispererConfig.MODE: self.config.get(WhispererConfig.MODE),
WhispererConfig.OUTPUT_MODE: self.config.get(
WhispererConfig.OUTPUT_MODE, OutputModes.LINE_PRINTER.value
),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,17 @@
"format": "password",
"description": "API key obtained from the Unstract developer portal (https://unstract-api-resource.developer.azure-api.net)"
},
"processing_mode": {
"mode": {
"type": "string",
"title": "Processing Mode",
"title": "Mode",
"enum": [
"text",
"ocr"
"native_text",
"low_cost",
"high_quality",
"form"
],
"default": "text",
"description": "Text mode tries to extract text from PDF and falls to OCR if the PDF is a scanned image PDF. This should be your default selection. Use OCR mode if you want to force OCR to extract text. This could be useful if you are dealing with malformed PDFs."
},
"force_text_processing": {
"type": "boolean",
"title": "Force Text Processing",
"default": false,
"description": "If checked, ensures that only text processing runs and there is no OCR involved. This differs from the default behaviour where we fall back to OCR processing in case of failures with text processing."
"default": "form",
"description": "Native text : Extracts text from PDF without OCR. This is very fast and cost effective. Use this mode if you are sure all your PDFs are native text pdfs (not scanned documents). Note that some scanned PDFs are \"searchable\" PDFs. Use the OCR modes for these PDFs as the quality of text in these documents are often poor. \n Low cost : Extracts text from scanned and native PDFs, images and office documents. This OCR mode cannot handle handwriting and low quality scanned pdfs and images. \n High quality : Extracts text from scanned and native PDFs, images and office documents. This OCR mode can handle handwriting and low quality scanned pdfs and images. \n Form: Extracts text from scanned and native PDFs, images and office documents. This OCR mode can handle handwriting and low quality scanned pdfs and images. Can also extract information about checkboxes and radio button"
},
"output_mode": {
"type": "string",
Expand All @@ -53,18 +49,7 @@
"default": "line-printer",
"description": "The output format. Valid options are line-printer, dump-text and text. The line-printer mode tries to maintain the layout of the original text and works very well as inputs to LLMs. dump-text just dumps each page as paragraphs. text extracts text into groups as it sees in the original page. text and dump-text are treated as same in ocr processing mode."
},
"median_filter_size": {
"type": "integer",
"title": "Median Filter Size",
"default": 0,
"description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version."
},
"gaussian_blur_radius": {
"type": "number",
"title": "Gaussian Blur Radius",
"default": 0.0,
"description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version."
},

"line_splitter_tolerance": {
"type": "number",
"title": "Line Splitter Tolerance",
Expand Down Expand Up @@ -92,18 +77,48 @@
}
},
"if": {
"properties": {
"force_text_processing": {
"const": "false"
"anyOf": [
{
"properties": {
"mode": {
"const": "low_cost"
}
}
},
{
"properties": {
"mode": {
"const": "high_quality"
}
}
},
{
"properties": {
"mode": {
"const": "form"
}
}
}
}
]
},
"then": {
"properties": {
"required": [
"median_filter_size",
"gaussian_blur_radius"
]
}
"median_filter_size": {
"type": "integer",
"title": "Median Filter Size",
"default": 0,
"description": "The size of the median filter to use for pre-processing the image during OCR based extraction. Useful to eliminate scanning artifacts and low quality JPEG artifacts. Default is 0 if the value is not explicitly set. Available only in the Enterprise version."
},
"gaussian_blur_radius": {
"type": "number",
"title": "Gaussian Blur Radius",
"default": 0.0,
"description": "The radius of the gaussian blur to use for pre-processing the image during OCR based extraction. Useful to eliminate noise from the image. Default is 0.0 if the value is not explicitly set. Available only in the Enterprise version."
}
},
"required": [
"median_filter_size",
"gaussian_blur_radius"
]
}
}

0 comments on commit 2a0d17b

Please sign in to comment.