Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python: support for multiple output formats #25

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Python/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.pyc
32 changes: 16 additions & 16 deletions Python/AbbyyOnlineSdk.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@
import urllib2
import urllib
import xml.dom.minidom
import xml.sax.saxutils

class ProcessingSettings:
    """ Recognition settings handed to AbbyyOnlineSdk.ProcessImage. """

    # Recognition language; sent as the "language" request parameter.
    Language = "English"
    # Single export format name.
    OutputFormat = "docx"
    # Export format names; ProcessImage joins them with ',' and sends the
    # result as the "exportFormat" request parameter.
    OutputFormats = ["docx"]

    def __init__( self ):
        # Give every instance its own list: without this, appending to
        # settings.OutputFormats would silently modify the class-level
        # default shared by all ProcessingSettings objects.
        self.OutputFormats = list( self.__class__.OutputFormats )

class Task:
Status = "Unknown"
Expand All @@ -41,9 +41,10 @@ class AbbyyOnlineSdk:
enableDebugging = 0

def ProcessImage( self, filePath, settings ):
outputFormats = ','.join(settings.OutputFormats)
urlParams = urllib.urlencode({
"language" : settings.Language,
"exportFormat" : settings.OutputFormat
"exportFormat" : outputFormats
})
requestUrl = self.ServerUrl + "processImage?" + urlParams

Expand All @@ -66,14 +67,10 @@ def GetTaskStatus( self, task ):
task = self.DecodeResponse( response )
return task

def DownloadResult( self, task, outputPath ):
    """ Download the result of a task and save it to outputPath.

    The result is requested from the server's "getResult" method using
    task.Id, with the authentication headers from buildAuthInfo() and the
    opener from getOpener(). The response body is written in binary mode.
    """
    getResultParams = urllib.urlencode( { "taskId" : task.Id } )
    getResultUrl = self.ServerUrl + "getResult?" + getResultParams
    request = urllib2.Request( getResultUrl, None, self.buildAuthInfo() )
    response = self.getOpener().open( request )
    try:
        fileResponse = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    # 'with' guarantees the file is flushed and closed, also on errors.
    with open( outputPath, "wb" ) as resultFile:
        resultFile.write( fileResponse )

def DownloadResult( self, resultUrl, outputPath ):
    """ Download the file at resultUrl and save it to outputPath.

    The URL is fetched with a plain urllib2.urlopen() call; no
    authentication headers are added here. The response body is written
    to outputPath in binary mode.
    """
    response = urllib2.urlopen( resultUrl )
    try:
        fileResponse = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    # 'with' guarantees the file is flushed and closed, also on errors.
    with open( outputPath, "wb" ) as resultFile:
        resultFile.write( fileResponse )

def DecodeResponse( self, xmlResponse ):
""" Decode xml response of the server. Return Task object """
Expand All @@ -83,7 +80,12 @@ def DecodeResponse( self, xmlResponse ):
task.Id = taskNode.getAttribute( "id" )
task.Status = taskNode.getAttribute( "status" )
if task.Status == "Completed":
task.DownloadUrl = taskNode.getAttribute( "resultUrl" )
task.DownloadUrl = []
for resultAttr in ('resultUrl', 'resultUrl2', 'resultUrl3'):
result = taskNode.getAttribute(resultAttr)
if result:
# xml escape needs to be unescaped
task.DownloadUrl.append(xml.sax.saxutils.unescape(result))
return task


Expand All @@ -95,10 +97,8 @@ def getOpener( self ):
self.opener = urllib2.build_opener( MultipartPostHandler.MultipartPostHandler,
urllib2.HTTPHandler(debuglevel=self.enableDebugging))
else:
self.opener = urllib2.build_opener(
self.Proxy,
self.opener = urllib2.build_opener(
self.Proxy,
MultipartPostHandler.MultipartPostHandler,
urllib2.HTTPHandler(debuglevel=self.enableDebugging))
return self.opener


70 changes: 51 additions & 19 deletions Python/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import time
import urllib2
import urllib
import urlparse

from AbbyyOnlineSdk import *

Expand All @@ -32,11 +33,11 @@


# Recognize a file at filePath and save result to resultFilePath
def recognizeFile( filePath, resultFilePath, language, outputFormat ):
def recognizeFile( filePath, resultFilePath, language, outputFormats ):
print "Uploading.."
settings = ProcessingSettings()
settings.Language = language
settings.OutputFormat = outputFormat
settings.OutputFormats = outputFormats
task = processor.ProcessImage( filePath, settings )
if task == None:
print "Error"
Expand All @@ -60,35 +61,66 @@ def recognizeFile( filePath, resultFilePath, language, outputFormat ):
task = processor.GetTaskStatus( task )

print "Status = %s" % task.Status

if task.Status == "Completed":
if task.DownloadUrl != None:
processor.DownloadResult( task, resultFilePath )
print "Result was written to %s" % resultFilePath
for i, url in enumerate(task.DownloadUrl):
fmt = outputFormats[i]
extension = OutputFormat.getExtension(fmt)
outputBaseName = os.path.basename(urlparse.urlparse(url).path)
outputFile = os.path.join(
resultFilePath, outputBaseName + '.' + extension)
processor.DownloadResult(url, outputFile)
# processor.DownloadResult( task, resultFilePath )
print "Result was written to %s" % outputFile
else:
print "Error processing task"




class OutputFormat(str):
    """ Helpers for the export format names accepted on the command line.

    The class is used as a namespace: `type` validates a format name (it is
    suitable as an argparse `type=` callable) and `getExtension` maps a
    format name to the file extension used for the downloaded result.
    """

    # Format names accepted by `type`.
    _availableFormats = (
        'txt', 'pdfSearchable', 'pdfTextAndImages', 'pdfa', 'rtf',
        'pptx', 'docx', 'xml', 'alto'
    )
    # Format names whose file extension differs from the name itself.
    _extensions = {
        'pdfSearchable': 'pdf',
        'pdfTextAndImages': 'pdf',
        'pdfa': 'pdf',
        'alto': 'xml'
    }
    # Format used when none is given on the command line.
    default = 'txt'

    @classmethod
    def type(cls, formatString):
        """ Return formatString unchanged if it is a known format name.

        Raises argparse.ArgumentTypeError for an unknown name, so argparse
        reports it as a normal usage error.
        """
        if formatString not in cls._availableFormats:
            raise argparse.ArgumentTypeError(
                'invalid format {}'.format(formatString))
        return formatString

    @classmethod
    def getExtension(cls, formatString):
        """ Return the file extension for formatString.

        Names without a special mapping are returned as-is, e.g. 'docx'.
        """
        return cls._extensions.get(formatString, formatString)





parser = argparse.ArgumentParser( description="Recognize a file via web service" )
parser.add_argument( 'sourceFile' )
parser.add_argument( 'targetFile' )

parser.add_argument( '-l', '--language', default='English', help='Recognition language (default: %(default))' )
group = parser.add_mutually_exclusive_group()
group.add_argument( '-txt', action='store_const', const='txt', dest='format', default='txt' )
group.add_argument( '-pdf', action='store_const', const='pdfSearchable', dest='format' )
group.add_argument( '-rtf', action='store_const', const='rtf', dest='format' )
group.add_argument( '-docx', action='store_const', const='docx', dest='format' )
group.add_argument( '-xml', action='store_const', const='xml', dest='format' )
parser.add_argument( 'targetDir' )
parser.add_argument( '-l', '--language', default='English',
help='Recognition language (default: %(default)s)' )
parser.add_argument( '-f', '--format', action='append', default=[], type=OutputFormat.type )

args = parser.parse_args()
if not args.format:
args.format.append( OutputFormat.default )
elif len( args.format ) > 3:
parser.error( 'use at most 3 output formats' )


sourceFile = args.sourceFile
targetFile = args.targetFile
targetDir = args.targetDir
language = args.language
outputFormat = args.format
outputFormats = args.format

if os.path.isfile( sourceFile ):
recognizeFile( sourceFile, targetFile, language, outputFormat )
recognizeFile( sourceFile, targetDir, language, outputFormats )