-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathCleaning.py
25 lines (21 loc) · 855 Bytes
/
Cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 30 14:41:59 2017
@author: Vishnu
"""
import re
# =============================================================================
# module to clean pdf text in a page
# =============================================================================
def clean(raw_data):
    """Strip URLs, HTML remnants and punctuation from one page of text.

    The steps run in this order:

    1. Delete the '€' and 'Œ' characters.
    2. Delete everything from an ``http://`` / ``https://`` prefix through
       the end of that line, together with the line break(s) after it.
    3. Replace the fragments ``<a href`` and ``<br />`` with a space.
    4. Delete '&', then replace each of
       ``_ " - ; % ( ) | + = * ! : # $ @ [ ] /`` and the apostrophe
       with a space.
    5. Delete the '®' and '™' characters.

    Args:
        raw_data: The text to clean (a string).

    Returns:
        The cleaned string. Runs of spaces produced by the substitutions
        are not collapsed.
    """
    text = raw_data.replace('€', '').replace('Œ', '')

    # '.' does not match '\n', so the match stops at the end of the line;
    # the trailing class then swallows the line break(s) that follow.
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)

    # HTML fragments must be handled BEFORE the punctuation pass: that pass
    # turns '/' into a space, after which '<br />' can no longer be found.
    text = text.replace('<a href', ' ')
    text = text.replace('<br />', ' ')

    # '&' is removed outright (no space left behind), unlike the rest of the
    # punctuation below, which is blanked out.
    text = text.replace('&', '')
    text = re.sub(r'[_"\-;%()|+=*!:#$@\[\]/]', ' ', text)
    text = text.replace("'", ' ')

    text = text.replace("®", '')
    return text.replace(u"\u2122", '')