-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrape.py
97 lines (86 loc) · 2.95 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import pandas as pd
from urllib.request import Request, urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import ssl
import trafilatura
import json
from tqdm import tqdm
def get_sitemap(url):
    """Download a sitemap and parse it as XML.

    Args:
        url: Address of the sitemap to fetch.

    Returns:
        A ``BeautifulSoup`` document built with the ``lxml-xml`` parser.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    # The request carries an explicit browser-like User-Agent header.
    req = Request(
        url=url,
        headers={"User-Agent": "Mozilla/5.0"},
    )
    # Read the whole body inside the ``with`` block so the HTTP response is
    # closed deterministically instead of being left open after parsing.
    with urlopen(req) as response:
        charset = response.info().get_param("charset")
        payload = response.read()
    # The charset declared in the response headers (if any) is passed on as
    # the encoding to use for the document.
    return BeautifulSoup(payload, "lxml-xml", from_encoding=charset)
def sitemap_to_dataframe(xml, name=None, data=None, verbose=False):
    """Convert the ``<url>`` entries of a parsed sitemap into a DataFrame.

    Args:
        xml: Parsed sitemap. It must provide ``find_all("url")``; each
            returned entry must provide ``find(tag_name)`` that yields an
            element with a ``.text`` attribute, or ``None`` when the tag is
            absent (a BeautifulSoup document satisfies this).
        name: Optional label stored in the ``sitemap_name`` column of every
            row; a falsy value is stored as an empty string.
        data: Unused; kept so existing callers keep working.
        verbose: When true, print each row dict as it is built.

    Returns:
        A DataFrame with the columns ``loc``, ``changefreq``, ``priority``,
        ``domain`` and ``sitemap_name`` (in that order), one row per
        ``<url>`` entry. Fields missing from an entry are empty strings.
    """
    columns = ["loc", "changefreq", "priority", "domain", "sitemap_name"]
    sitemap_name = name if name else ""

    rows = []
    for entry in xml.find_all("url"):
        # Look each field up inside this entry only. Searching the whole
        # document (or forward past the entry) would make an entry that
        # lacks a field pick up the value of a later entry.
        fields = {}
        for tag_name in ("loc", "changefreq", "priority"):
            node = entry.find(tag_name)
            fields[tag_name] = node.text if node is not None else ""

        row = {
            # urlparse("") has an empty netloc, so a missing loc gives "".
            "domain": urlparse(fields["loc"]).netloc,
            "loc": fields["loc"],
            "changefreq": fields["changefreq"],
            "priority": fields["priority"],
            "sitemap_name": sitemap_name,
        }
        if verbose:
            print(row)
        rows.append(row)

    # Build the frame once; concatenating row by row copies the whole frame
    # on every iteration.
    return pd.DataFrame(rows, columns=columns)
def extract_text_from_url(url):
    """Download a web page and return its extracted text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``"text"`` field of trafilatura's JSON extraction result.

    Raises:
        RuntimeError: If the page could not be downloaded, or if nothing
            could be extracted from it.
    """
    downloaded = trafilatura.fetch_url(url)
    # A failed download is reported as None rather than as an exception;
    # fail here with a clear message instead of deep inside JSON decoding.
    if downloaded is None:
        raise RuntimeError(f"Could not download {url!r}")

    extracted = trafilatura.extract(
        downloaded,
        output_format="json",
        with_metadata=True,
        include_comments=False,
        date_extraction_params={"extensive_search": True, "original_date": True},
    )
    # extract() likewise returns None when it finds no usable content.
    if extracted is None:
        raise RuntimeError(f"No content could be extracted from {url!r}")

    return json.loads(extracted)["text"]
def main():
    """Scrape the chapters listed in a sitemap into ``./chapters.txt``.

    Fetches the hard-coded chapter sitemap, drops URLs containing ``%``,
    skips the first remaining URL and a fixed set of topics, then for each
    chapter writes the topic slug followed by the lower-cased, cleaned text
    and a ``###`` separator line.
    """
    # NOTE(review): `_create_stdlib_context` is a private name; in CPython's
    # ssl module it is an alias for the unverified-context factory, so this
    # appears to disable certificate verification for urllib requests in the
    # whole process. Confirm this is intended.
    ssl._create_default_https_context = ssl._create_stdlib_context

    sitemap_url = "https://zerodha.com/varsity/chapter-sitemap2.xml"
    skipped_topics = (
        "the-vegetable-list",
        "bonus-share-vs-stock-split",
        "getting-started-2",
    )
    # Removed in this order: the second phrase contains "varsity", so it has
    # to be stripped before the bare word is.
    boilerplate = (
        "key takeaways from this chapter",
        "we recommend reading this chapter on varsity to learn more and understand the concepts in-depth.",
        "varsity",
    )

    xml = get_sitemap(sitemap_url)
    df = sitemap_to_dataframe(xml, verbose=False)
    chapter_urls = [loc for loc in df["loc"].to_numpy() if "%" not in loc]

    # Scraped text can contain non-ASCII characters; an explicit UTF-8
    # encoding keeps the output independent of the platform's locale default.
    with open("./chapters.txt", "w", encoding="utf-8") as f:
        # The first URL is skipped — presumably a non-chapter entry; TODO
        # confirm against the sitemap.
        for chapter_url in tqdm(chapter_urls[1:]):
            # The topic slug is the second-to-last "/"-separated component of
            # the URL.
            topic = chapter_url.split("/")[-2]
            if "hindi" in topic or topic in skipped_topics:
                continue

            text = extract_text_from_url(url=chapter_url).lower()
            for phrase in boilerplate:
                text = text.replace(phrase, "")

            f.write(topic + "\n")
            f.write(text + "\n###\n")
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()