forked from aws-samples/amazon-textract-code-samples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path07-search.py
65 lines (50 loc) · 1.79 KB
/
07-search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import boto3
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
def indexDocument(bucketName, objectName, text):
    """Index the text of one document into the ``textract`` Elasticsearch index.

    The request is authenticated with an ``AWS4Auth`` object built from the
    credentials of the default boto3 session. When *text* is empty or
    ``None`` the function returns immediately without contacting AWS or
    Elasticsearch.

    Args:
        bucketName: Name of the bucket the document came from; stored in the
            ``bucket`` field of the indexed document.
        objectName: Name (key) of the document; stored in the ``name`` field
            and used as the Elasticsearch document id.
        text: Text to store in the ``content`` field.

    Returns:
        None. A confirmation line is printed after the document is indexed.
    """
    # Update host with the endpoint of your Elasticsearch cluster.
    host = "searchxxxxxxxxxxxxxxxx.us-east-1.es.amazonaws.com"
    # Region used for request signing when the boto3 session has none set.
    defaultRegion = 'us-east-1'
    service = 'es'

    # Nothing to index: skip credential lookup and the network round trip.
    if not text:
        return

    session = boto3.Session()
    credentials = session.get_credentials()
    # The session's region may be unset (None); fall back to the default
    # rather than handing a missing region to the request signer.
    region = session.region_name or defaultRegion

    awsauth = AWS4Auth(
        credentials.access_key,
        credentials.secret_key,
        region,
        service,
        session_token=credentials.token,
    )

    es = Elasticsearch(
        hosts=[{'host': host, 'port': 443}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
    )

    document = {
        "name": "{}".format(objectName),
        "bucket": "{}".format(bucketName),
        "content": text,
    }

    es.index(index="textract", doc_type="document", id=objectName, body=document)

    print("Indexed document: {}".format(objectName))
# Document to analyze: S3 bucket and object key.
s3BucketName = "ki-textract-demo-docs"
documentName = "simple-document-image.jpg"

# Amazon Textract client
textract = boto3.client('textract')

# Call Amazon Textract on the S3 object
response = textract.detect_document_text(
    Document={
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': documentName
        }
    })

# Collect the text of every LINE block, in the order the response lists them.
lines = [
    item["Text"]
    for item in response["Blocks"]
    if item["BlockType"] == "LINE"
]

# Print detected text (ANSI escape codes colour the output blue).
for line in lines:
    print('\033[94m' + line + '\033[0m')

# Join with newlines so the last word of one line is not fused with the
# first word of the next in the indexed content.
text = "\n".join(lines)

indexDocument(s3BucketName, documentName, text)

# You can view the indexed documents in the Kibana dashboard.