-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdata.py
29 lines (25 loc) · 902 Bytes
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 30 14:44:16 2017
@author: Vishnu
"""
from Cleaning import clean
import PyPDF2
# =============================================================================
# Module to read PDF files and convert them to text, page by page
# =============================================================================
def Data(path):
    """Collect the cleaned text of the pages of one or more PDF files.

    Every page of every file is extracted with PyPDF2, passed through
    ``clean`` (imported from the ``Cleaning`` module) and kept only when
    the cleaned text contains more than 10 whitespace-separated tokens.

    Args:
        path: Iterable of PDF file paths. Despite the singular name it is
            iterated, one file per element; a bare string would therefore
            be iterated character by character.

    Returns:
        A flat list of the retained cleaned page texts, in file order and
        then page order. Empty when ``path`` is empty or no page passes
        the length filter.
    """
    main_data = []
    for pdf_path in path:
        # Keep the handle open for the whole extraction (the reader is built
        # on top of it) and let the context manager close it afterwards, even
        # if a page fails to parse. The original never closed the file.
        with open(pdf_path, 'rb') as pdf_file:
            read_pdf = PyPDF2.PdfFileReader(pdf_file)
            for page_number in range(read_pdf.getNumPages()):
                page_content = clean(read_pdf.getPage(page_number).extractText())
                # An empty string splits into zero tokens, so this single
                # check also rejects '' (assumes clean() returns a str).
                if len(page_content.split()) > 10:
                    main_data.append(page_content)
    return main_data