-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscraper.py
97 lines (81 loc) · 3.33 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import requests
from requests.auth import HTTPBasicAuth
import config
import os
class GithubScraper:
    """List and download source files of a GitHub repository.

    Files are discovered and fetched through the GitHub REST
    ``/repos/{owner}/{repo}/contents/{path}`` endpoint. Only files whose path
    ends with one of ``self.extensions`` are kept.
    """

    # Seconds to wait for GitHub before giving up on a request; without a
    # timeout a stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, repo_url):
        """Derive owner and repository name from *repo_url*.

        The last two ``/``-separated components of the URL (ignoring a
        trailing slash) are taken as the owner and the repository name.
        """
        self.repo_url = repo_url
        parts = repo_url.rstrip('/').split('/')
        self.owner = parts[-2]
        self.repo = parts[-1]
        # (language name, file extension) pairs used by get_langExtension.
        self.coding_language = [
            ("Python", ".py"),
            ("Java", ".java"),
            ("JavaScript", ".js"),
            ("C", ".c"),
            ("C++", ".cpp"),
            ("C#", ".cs"),
            ("Ruby", ".rb"),
            ("PHP", ".php"),
            ("Go", ".go"),
            ("Swift", ".swift"),
            ("Kotlin", ".kt"),
            ("Rust", ".rs"),
            ("HTML", ".html"),
            ("CSS", ".css"),
            ("TypeScript", ".ts"),
            ("Shell Script", ".sh"),
            ("Perl", ".pl"),
            ("R", ".R"),
            ("Scala", ".scala"),
            ("Lua", ".lua"),
            ("Dart", ".dart"),
            ("Objective-C", ".m"),
            ("Elixir", ".ex"),
            ("Haskell", ".hs"),
            ("MATLAB", ".m"),
            ("Groovy", ".groovy"),
            ("Visual Basic", ".vb"),
            ("Assembly", ".asm"),
        ]
        # Extensions that get_files keeps; None disables the filter.
        # NOTE(review): several extensions listed in coding_language
        # (.scala, .m, .ex, .hs, .groovy) are absent here - confirm whether
        # that omission is intentional.
        self.extensions = [
            '.py', '.java', '.js', '.c', '.cpp', '.cs', '.rb', '.php', '.go', '.swift',
            '.kt', '.rs', '.html', '.css', '.ts', '.sh', '.pl', '.R', '.lua',
            '.dart', '.vb', '.asm'
        ]
        self.token = config.CODEGPT_PERSONAL_ACCESS_TOKEN

    def get_langExtension(self, lang):
        """Return the file extension registered for language name *lang*.

        Returns the string ``"unknown"`` when *lang* is not present in
        ``self.coding_language``.
        """
        # coding_language is a list of pairs, so build the mapping here
        # instead of calling .get on the list itself.
        return dict(self.coding_language).get(lang, "unknown")

    def _contents_url(self, path=''):
        """Return the contents-API URL for *path* inside the repository."""
        from urllib.parse import quote

        # Percent-encode everything except '/', so characters such as '#'
        # or '?' in a file name are not parsed as URL fragment or query.
        return (
            f'https://api.github.com/repos/{self.owner}/{self.repo}'
            f'/contents/{quote(path)}'
        )

    def _auth(self):
        """Return basic-auth credentials built from the token, or None."""
        # The token is sent as the password; 'username' is a fixed
        # placeholder value.
        return HTTPBasicAuth('username', self.token) if self.token else None

    def _has_wanted_extension(self, path):
        """Return True when *path* passes the ``self.extensions`` filter."""
        return self.extensions is None or path.endswith(tuple(self.extensions))

    @staticmethod
    def _local_name(file_path):
        """Return the flat local file name used to store *file_path*."""
        return file_path.replace('/', '\\') + '.txt'

    def get_files(self, path=''):
        """Recursively collect repository file paths below *path*.

        Returns a list of repository-relative paths whose extension passes
        the filter. Raises ``requests.HTTPError`` when GitHub answers with
        an error status.
        """
        headers = {'Accept': 'application/vnd.github.v3+json'}
        response = requests.get(
            self._contents_url(path),
            headers=headers,
            auth=self._auth(),
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        contents = response.json()
        # A path naming a single file yields one object rather than a
        # list of directory entries; normalise so the loop handles both.
        if isinstance(contents, dict):
            contents = [contents]
        files = []
        for item in contents:
            if item['type'] == 'file':
                if self._has_wanted_extension(item['path']):
                    files.append(item['path'])
                    print('[!] retrieving', item['path'])
            elif item['type'] == 'dir':
                files.extend(self.get_files(item['path']))
        return files

    def download_files(self, save_dir):
        """Download every matching repository file into *save_dir*.

        Each file is stored as text under a flattened name: ``/`` in the
        repository path is replaced by a backslash and ``.txt`` is appended.
        Raises ``requests.HTTPError`` when GitHub answers with an error
        status.
        """
        os.makedirs(save_dir, exist_ok=True)
        headers = {'Accept': 'application/vnd.github.v3.raw'}
        auth = self._auth()
        for file_path in self.get_files():
            response = requests.get(
                self._contents_url(file_path),
                headers=headers,
                auth=auth,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            save_path = os.path.join(save_dir, self._local_name(file_path))
            # Where backslash is a path separator (Windows) the flattened
            # name points into sub-directories, which must exist before open.
            parent = os.path.dirname(save_path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            print(f'[!!] downloading {file_path}')
            with open(save_path, 'w', encoding='utf-8') as write_file:
                write_file.write(response.text)