-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPDF-to-Text-D.py
38 lines (27 loc) · 1.41 KB
/
PDF-to-Text-D.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import PyPDF2
import os
# Directory where the PDF files are located
pdf_directory = '/path/to/pdf/files'  # Replace with the actual directory path
# Output directory for the combined text file
output_directory = '/path/to/output/directory'  # Replace with the desired output directory path
# Name of the combined text file
combined_text_file_name = 'combined_text.txt'  # Replace with the desired name of the combined text file


def find_pdf_files(directory):
    """Return the paths of all PDF files found under *directory*.

    The tree is walked recursively. The extension match is case-insensitive,
    so ``report.PDF`` is picked up as well as ``report.pdf``. Sub-directories
    and file names are visited in sorted order so that the result (and hence
    the combined output) is the same on every run, regardless of the order in
    which the filesystem happens to list entries.

    Args:
        directory: Root directory to search.

    Returns:
        A list of file paths; empty if the directory does not exist or holds
        no PDF files.
    """
    pdf_paths = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place steers os.walk's top-down traversal order.
        dirs.sort()
        for file_name in sorted(files):
            if file_name.lower().endswith('.pdf'):
                pdf_paths.append(os.path.join(root, file_name))
    return pdf_paths


def extract_pdf_text(pdf_file_path):
    """Return the text of every page of one PDF, concatenated in page order.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Returns:
        The extracted text as a single string (no separator between pages).
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file_path)
    # ``or ""`` is a defensive guard: a falsy extraction result (e.g. None)
    # contributes nothing instead of breaking the join with a TypeError.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def combine_pdf_text(directory):
    """Return the concatenated text of all PDF files under *directory*."""
    return "".join(
        extract_pdf_text(pdf_file_path)
        for pdf_file_path in find_pdf_files(directory)
    )


def main():
    """Extract text from the configured PDFs and save it to one text file.

    Reads the module-level settings ``pdf_directory``, ``output_directory``
    and ``combined_text_file_name``.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    combined_text = combine_pdf_text(pdf_directory)

    # Create the output text file path in the specified output directory
    output_text_file_path = os.path.join(output_directory, combined_text_file_name)

    # Write the combined text to the output text file
    with open(output_text_file_path, 'w', encoding='utf-8') as text_file:
        text_file.write(combined_text)

    print(f"Text extracted from PDFs and saved to {output_text_file_path}")


if __name__ == "__main__":
    main()