forked from aws-samples/amazon-textract-code-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path12-pdf-text.py
executable file
·73 lines (57 loc) · 2.06 KB
/
12-pdf-text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import boto3
import time
def start_job(client, s3_bucket_name, object_name):
    """Start an asynchronous text-detection job for a document stored in S3.

    Args:
        client: Object exposing ``start_document_text_detection`` (the script
            entry point passes a boto3 Textract client).
        s3_bucket_name: Name of the S3 bucket that holds the document.
        object_name: Key (name) of the document inside the bucket.

    Returns:
        The ``JobId`` value taken from the service response.

    Raises:
        KeyError: If the response does not contain ``JobId``.
    """
    response = client.start_document_text_detection(
        DocumentLocation={
            "S3Object": {
                "Bucket": s3_bucket_name,
                "Name": object_name,
            }
        }
    )
    return response["JobId"]
def is_job_complete(client, job_id, *, poll_interval=1):
    """Poll a text-detection job until it is no longer ``IN_PROGRESS``.

    Each iteration sleeps, fetches the job, and prints its status. The first
    poll is also preceded by a sleep, matching the original behaviour.

    Args:
        client: Object exposing ``get_document_text_detection``.
        job_id: Identifier of the job to poll.
        poll_interval: Seconds to sleep before every poll. Defaults to 1.

    Returns:
        The final ``JobStatus`` string. NOTE: despite the function name this
        is not a boolean; every non-empty status is truthy, so callers must
        compare it explicitly to tell success from failure.
    """
    # There is no upper bound on the number of polls: the loop ends only
    # when the reported status changes away from "IN_PROGRESS".
    while True:
        time.sleep(poll_interval)
        response = client.get_document_text_detection(JobId=job_id)
        status = response["JobStatus"]
        print("Job status: {}".format(status))
        if status != "IN_PROGRESS":
            return status
def get_job_results(client, job_id, *, page_delay=1):
    """Fetch every result page of a text-detection job.

    Follows the ``NextToken`` value of each response until a response has no
    (or an empty) token, printing a running page count as pages arrive.

    Args:
        client: Object exposing ``get_document_text_detection``.
        job_id: Identifier of the job whose results are fetched.
        page_delay: Seconds to sleep before every page request, including
            the first. Defaults to 1.

    Returns:
        A list of the raw response objects, one per page, in fetch order.
    """
    pages = []
    # The first request carries only the job id; later requests add the
    # continuation token returned by the previous page.
    request = {"JobId": job_id}
    while True:
        time.sleep(page_delay)
        response = client.get_document_text_detection(**request)
        pages.append(response)
        print("Resultset page received: {}".format(len(pages)))

        next_token = response.get("NextToken")
        if not next_token:
            return pages
        request["NextToken"] = next_token
if __name__ == "__main__":
    # Document to analyse: S3 location and the region of the Textract client.
    s3_bucket_name = "ki-textract-demo-docs"
    document_name = "Amazon-Textract-Pdf.pdf"
    region = "us-east-1"

    client = boto3.client('textract', region_name=region)

    job_id = start_job(client, s3_bucket_name, document_name)
    print("Started job with id: {}".format(job_id))

    # is_job_complete returns the final status *string*, which is truthy for
    # every outcome. Compare it explicitly so a failed job is reported
    # instead of being treated as complete.
    status = is_job_complete(client, job_id)
    if status not in ("SUCCEEDED", "PARTIAL_SUCCESS"):
        raise SystemExit(
            "Job {} finished with status: {}".format(job_id, status)
        )

    response = get_job_results(client, job_id)

    # Print detected text: every LINE block, wrapped in ANSI escape codes
    # (bright-blue foreground, then reset).
    for result_page in response:
        for item in result_page["Blocks"]:
            if item["BlockType"] == "LINE":
                print('\033[94m' + item["Text"] + '\033[0m')