Skip to content

Commit

Permalink
blla segmenter basically working
Browse files — browse the repository at this point in the history
  • Loading branch information
kba committed Dec 30, 2020
1 parent 07fd913 commit c530706
Show file tree
Hide file tree
Showing 4 changed files with 138 additions and 55 deletions.
1 change: 0 additions & 1 deletion ocrd_kraken/ocrd-tool.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
"description": "Block segmentation with kraken",
"parameters": {
"text_direction": {"type": "string", "description": "Sets principal text direction", "enum": ["horizontal-lr", "horizontal-rl", "vertical-lr", "vertical-rl"], "default": "horizontal-lr"},
"script_detect": {"type": "boolean", "description": "Enable script detection on segmenter output", "default": false},
"maxcolseps": {"description": "maximum number of column separators", "type": "number", "format": "integer", "default": 2},
"scale": {"description": "scale factor", "type": "number", "format": "float", "default": 0},
"black_colseps": {"description": "Whether column separators are assumed to be vertical black lines or not", "type": "boolean", "default": false},
Expand Down
71 changes: 38 additions & 33 deletions ocrd_kraken/segment.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
from __future__ import absolute_import
from os.path import join

from ocrd import Processor
from ocrd_utils import (
getLogger,
assert_file_grp_cardinality,
make_file_id,
concat_padded,
points_from_x0y0x1y1,
points_from_polygon,
MIMETYPE_PAGE
)
from ocrd_models.ocrd_page import TextRegionType, TextLineType, CoordsType, to_xml
from ocrd_models.ocrd_page import TextRegionType, TextLineType, CoordsType, BaselineType, to_xml
from ocrd_modelfactory import page_from_file

import shapely.geometry as geom
from kraken.lib.vgsl import TorchVGSLModel
from kraken.pageseg import segment as legacy_segment
from kraken.blla import segment as blla_segment
Expand All @@ -33,7 +36,7 @@ def process(self):
assert_file_grp_cardinality(self.output_file_grp, 1)
kwargs = {}
kwargs['text_direction'] = self.parameter['text_direction']
use_legacy = self.parameter['use_legacy']:
use_legacy = self.parameter['use_legacy']
if use_legacy:
kwargs['scale'] = self.parameter['scale']
kwargs['maxcolseps'] = self.parameter['maxcolseps']
Expand All @@ -42,44 +45,46 @@ def process(self):
segment = legacy_segment
else:
log.info("Using blla segmenter")
blla_model_fname = self.resolve_resource(parameter['blla_model'])
kwargs['model'] = TorchVGSLModel(blla_model_fname)
blla_model_fname = self.resolve_resource(self.parameter['blla_model'])
kwargs['model'] = TorchVGSLModel.load_model(blla_model_fname)
kwargs['device'] = self.parameter['device']
segment = blla_segment

for (n, input_file) in enumerate(self.input_files):
page_id = input_file.pageId or input_file.ID
self.logger.info("INPUT FILE %i / %s", n, page_id)
log.info("INPUT FILE %i / %s of %s", n, page_id, len(self.input_files))
pcgts = page_from_file(self.workspace.download_file(input_file))
self.add_metadata(pcgts)
page = pcgts.get_Page()
page_image, page_coords, page_image_info = self.workspace.image_from_page(page, page_id, feature_selector="binarized")
log.info('Segmenting')
log.info('Params %s', self.parameter)
res = segment(
page_image,
self.parameter['text_direction'],
self.parameter['scale'],
self.parameter['maxcolseps'],
self.parameter['black_colseps']
)
if self.parameter['script_detect']:
res = detect_scripts(im, res)

dummyRegion = TextRegionType()
pcgts.get_Page().add_TextRegion(dummyRegion)
# print(res)
for lineno, box in enumerate(res['boxes']):
textline = TextLineType(
id=concat_padded("line", lineno),
Coords=CoordsType(points=points_from_x0y0x1y1(box))
)
dummyRegion.add_TextLine(textline)
ID = concat_padded(self.output_file_grp, n)
page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_selector="binarized")
log.info('Segmenting with %s segmenter' % ('legacy' if use_legacy else 'blla'))
res = segment(page_image, **kwargs)
log.info("Finished segmentation, serializing")
if use_legacy:
raise NotImplementedError("legacy segmenter NIH")
else:
for idx_region, region_polygon_ in enumerate(res['regions']['text']):
region_elem = TextRegionType(
id=f'region_{idx_region}',
Coords=CoordsType(points=points_from_polygon(region_polygon_)))
region_polygon = geom.Polygon(region_polygon_)
line_idx = 0
for line_dict in res['lines']:
line_polygon = geom.Polygon(line_dict['boundary'])
if region_polygon.contains(line_polygon):
region_elem.add_TextLine(TextLineType(
id=f'region_{idx_region}_line_{line_idx}',
Baseline=BaselineType(points=points_from_polygon(line_dict['baseline'])),
Coords=CoordsType(points=points_from_polygon(line_dict['boundary']))))
# TODO handle unmatched or twice-matched lines
line_idx += 1
page.add_TextRegion(region_elem)
file_id = make_file_id(input_file, self.output_file_grp)
pcgts.set_pcGtsId(file_id)
self.workspace.add_file(
self.output_file_grp,
ID=file_id,
file_grp=self.output_file_grp,
pageId=input_file.pageId,
ID=ID,
mimetype=MIMETYPE_PAGE,
local_filename="%s/%s.xml" % (self.output_file_grp, ID),
content=to_xml(pcgts).encode('utf-8'))
local_filename=join(self.output_file_grp, f'{file_id}.xml'),
content=to_xml(pcgts))
88 changes: 84 additions & 4 deletions tests/base.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,89 @@
# pylint: disable=unused-import

import os
from os.path import dirname, realpath
from os import chdir
import sys
from unittest import TestCase, skip, main # pylint: disable=unused-import
import logging
import io
import collections
from unittest import TestCase as VanillaTestCase, skip, main as unittests_main
import pytest
from ocrd_utils import disableLogging, initLogging

from tests.assets import assets
from tests.assets import assets, copy_of_directory

PWD = os.path.dirname(os.path.realpath(__file__))

def main(fn=None):
    """Run pytest on *fn* when given; otherwise defer to unittest's runner."""
    if not fn:
        unittests_main()
        return
    # exit with pytest's return code so CI sees failures
    sys.exit(pytest.main([fn]))


class TestCase(VanillaTestCase):
    """Project base TestCase: run from the repository root, reset logging per test."""

    @classmethod
    def setUpClass(cls):
        # change into the repository root so relative asset paths resolve
        chdir(dirname(realpath(__file__)) + '/..')

    def setUp(self):
        # give every test a fresh ocrd_utils logging configuration
        disableLogging()
        initLogging()

class CapturingTestCase(TestCase):
    """A TestCase that captures stdout/stderr and can invoke a click CLI."""

    @pytest.fixture(autouse=True)
    def _setup_pytest_capfd(self, capfd):
        # keep pytest's capfd fixture around so plain methods can use it
        self.capfd = capfd

    def capture_out_err(self):
        """Return (and reset) everything captured on stdout/stderr so far."""
        return self.capfd.readouterr()

    def invoke_cli(self, cli, args):
        """
        Substitution for click.CliRunner.invoke that works together nicely
        with unittests/pytest capturing stdout/stderr.
        """
        # XXX snapshot just before executing the CLI
        self.capture_out_err()
        exit_code = 0
        # XXX necessary because sys.argv reflects pytest args not cli args
        sys.argv[1:] = args
        try:
            cli.main(args=args)
        except SystemExit as exc:
            exit_code = exc.code
        captured_out, captured_err = self.capture_out_err()
        return exit_code, captured_out, captured_err

# import traceback
# import warnings
# def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
# log = file if hasattr(file, 'write') else sys.stderr
# traceback.print_stack(file=log)
# log.write(warnings.formatwarning(message, category, filename, lineno, line))
# warnings.showwarning = warn_with_traceback

# https://stackoverflow.com/questions/37944111/python-rolling-log-to-a-variable
# Adapted from http://alanwsmith.com/capturing-python-log-output-in-a-variable

class FIFOIO(io.TextIOBase):
    """In-memory text stream keeping only the most recent *size* characters.

    Writes append whole chunks to a deque; when the total buffered length
    exceeds ``size``, the oldest chunks are dropped in full (so the buffer
    may momentarily shrink below ``size`` by up to one chunk).  Pass
    ``size=None`` for an unbounded buffer.  Useful for capturing a rolling
    tail of log output.
    """

    def __init__(self, size, *args):
        # maximum total buffered length in characters, or None for no limit
        self.maxsize = size
        super().__init__(*args)
        self.deque = collections.deque()

    def getvalue(self):
        """Return the currently buffered text as one string."""
        return ''.join(self.deque)

    def write(self, x):
        self.deque.append(x)
        self.shrink()
        # io.TextIOBase contract: return the number of characters written
        # (the original returned None, breaking e.g. print(..., file=fifo)
        # consumers that check the result)
        return len(x)

    def shrink(self):
        """Drop oldest chunks until the total length is within maxsize."""
        if self.maxsize is None:
            return
        size = sum(len(x) for x in self.deque)
        while size > self.maxsize:
            x = self.deque.popleft()
            size -= len(x)

sys.path.append(dirname(realpath(__file__)) + '/../ocrd')
33 changes: 16 additions & 17 deletions tests/test_kraken_segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,29 @@

from tests.base import TestCase, assets, main

from ocrd.resolver import Resolver
from ocrd import Resolver
from ocrd_utils import initLogging, pushd_popd
from ocrd_kraken.segment import KrakenSegment
PARAM_JSON = assets.url_of('param-segment.json')

WORKSPACE_DIR = '/tmp/ocrd-ocropy-segment-test'

class TestKrakenSegment(TestCase):

def setUp(self):
if os.path.exists(WORKSPACE_DIR):
shutil.rmtree(WORKSPACE_DIR)
os.makedirs(WORKSPACE_DIR)
initLogging()

def test_run1(self):
resolver = Resolver()
workspace = resolver.workspace_from_url(assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml'), dst_dir=WORKSPACE_DIR)
proc = KrakenSegment(
workspace,
input_file_grp="OCR-D-IMG-BIN",
output_file_grp="OCR-D-SEG-LINE-KRAKEN",
parameter={'level-of-operation': 'line'}
)
proc.process()
workspace.save_mets()
# with pushd_popd(tempdir=True) as tempdir:
with pushd_popd('/tmp/kraken-test') as tempdir:
workspace = resolver.workspace_from_url(assets.path_to('communist_manifesto/data/mets.xml'), dst_dir=tempdir)
proc = KrakenSegment(
workspace,
input_file_grp="OCR-D-IMG-BIN",
output_file_grp="OCR-D-SEG-LINE-KRAKEN",
parameter={'level-of-operation': 'line'}
)
proc.process()
workspace.save_mets()
assert 0

if __name__ == "__main__":
main()
main(__file__)

0 comments on commit c530706

Please sign in to comment.