diff --git a/src/Tesseract/Interop/BaseApi.cs b/src/Tesseract/Interop/BaseApi.cs
index 048d4ab9..58de769c 100644
--- a/src/Tesseract/Interop/BaseApi.cs
+++ b/src/Tesseract/Interop/BaseApi.cs
@@ -96,6 +96,12 @@ int BaseApiInit(HandleRef handle, string datapath, string language, int mode,
         [RuntimeDllImport(Constants.TesseractDllName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "TessBaseAPIMeanTextConf")]
         int BaseAPIMeanTextConf(HandleRef handle);
 
+        /// <summary>
+        /// Binds the native TessBaseAPIProcessPage entry point; Page.Recognize(int, int) treats a return value of 1 as success.
+        /// </summary>
+        [RuntimeDllImport(Constants.TesseractDllName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "TessBaseAPIProcessPage")]
+        int BaseAPIProcessPage(HandleRef handle, HandleRef pix, int page_index, string filename, string retry_config, int timeout_millisec, HandleRef renderer);
+
         [RuntimeDllImport(Constants.TesseractDllName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "TessBaseAPIRecognize")]
         int BaseApiRecognize(HandleRef handle, HandleRef monitor);
 
diff --git a/src/Tesseract/Page.cs b/src/Tesseract/Page.cs
index 2fc388d1..369dac9f 100644
--- a/src/Tesseract/Page.cs
+++ b/src/Tesseract/Page.cs
@@ -122,6 +122,22 @@ public string GetText()
             return Interop.TessApi.BaseAPIGetUTF8Text(Engine.Handle);
         }
 
+        /// <summary>
+        /// Gets the page's content as text, running recognition through the
+        /// native ProcessPage call with the given timeout.
+        /// </summary>
+        /// <param name="timeout">Timeout in milliseconds, forwarded to the native ProcessPage call.</param>
+        /// <returns>The recognized text, or <c>null</c> when recognition did not report success.</returns>
+        public string GetText(int timeout)
+        {
+            if (!Recognize(0, timeout))
+            {
+                return null;
+            }
+
+            return Interop.TessApi.BaseAPIGetUTF8Text(Engine.Handle);
+        }
+
         /// <summary>
         /// Gets the page's content as a HOCR text.
         /// </summary>
@@ -139,6 +155,30 @@ public string GetHOCRText(int pageNum, bool useXHtml = false)
                 return Interop.TessApi.BaseAPIGetHOCRText(Engine.Handle, pageNum);
         }
 
+        /// <summary>
+        /// Gets the page's content as a HOCR text, running recognition with the given timeout.
+        /// </summary>
+        /// <param name="pageNum">The page number; must be zero or greater.</param>
+        /// <param name="timeout">Timeout in milliseconds, forwarded to the native ProcessPage call.</param>
+        /// <param name="useXHtml">True to read the result via BaseAPIGetHOCRText2, false to read it via BaseAPIGetHOCRText.</param>
+        /// <returns>The HOCR text, or <c>null</c> when recognition did not report success.</returns>
+        public string GetHOCRText(int pageNum, int timeout, bool useXHtml = false)
+        {
+            Guard.Require("pageNum", pageNum >= 0, "Page number must be greater than or equal to zero (0).");
+
+            if (!Recognize(pageNum, timeout))
+            {
+                return null;
+            }
+
+            if (useXHtml)
+            {
+                return Interop.TessApi.BaseAPIGetHOCRText2(Engine.Handle, pageNum);
+            }
+
+            return Interop.TessApi.BaseAPIGetHOCRText(Engine.Handle, pageNum);
+        }
+
         /// <summary>
         /// Get's the mean confidence that as a percentage of the recognized text.
         /// </summary>
@@ -295,6 +335,55 @@ internal void Recognize()
             }
         }
 
+        /// <summary>
+        /// Runs recognition through the native ProcessPage call (which accepts a timeout), unless the page has already been recognized.
+        /// </summary>
+        /// <param name="pageNum">The page index forwarded to ProcessPage.</param>
+        /// <param name="timeout">Timeout in milliseconds, forwarded to ProcessPage.</param>
+        /// <returns>True when recognition results are available; false when ProcessPage did not return 1.</returns>
+        private bool Recognize(int pageNum, int timeout)
+        {
+            Guard.Verify(PageSegmentMode != PageSegMode.OsdOnly, "Cannot OCR image when using OSD only page segmentation, please use DetectBestOrientation instead.");
+
+            // Recognition already ran for this page, so repeated calls must report success rather than a failure.
+            if (runRecognitionPhase)
+            {
+                return true;
+            }
+
+            // No renderer is supplied (null handle); callers read the results through the base API afterwards.
+            int result = Interop.TessApi.Native.BaseAPIProcessPage(Engine.Handle, Image.Handle, pageNum, null, null, timeout, new HandleRef(this, IntPtr.Zero));
+            if (result != 1)
+            {
+                return false;
+            }
+
+            runRecognitionPhase = true;
+
+            // now write out the thresholded image if required to do so
+            bool tesseditWriteImages;
+            if (Engine.TryGetBoolVariable("tessedit_write_images", out tesseditWriteImages) && tesseditWriteImages)
+            {
+                using (Pix thresholdedImage = GetThresholdedImage())
+                {
+                    string filePath = Path.Combine(Environment.CurrentDirectory, "tessinput.tif");
+                    try
+                    {
+                        thresholdedImage.Save(filePath, ImageFormat.TiffG4);
+                        trace.TraceEvent(TraceEventType.Information, 2,
+                            "Successfully saved the thresholded image to '{0}'", filePath);
+                    }
+                    catch (Exception error)
+                    {
+                        trace.TraceEvent(TraceEventType.Error, 2,
+                            "Failed to save the thresholded image to '{0}'.\nError: {1}", filePath, error.Message);
+                    }
+                }
+            }
+
+            return true;
+        }
+
         protected override void Dispose(bool disposing)
         {
             if (disposing) {