-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
76 lines (60 loc) · 1.88 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
Corpora
==============
Collects article data from feeds without applying any processing to it.
The raw data is simply stored to build a corpus
that can be processed later or used for later training.
"""
import sys
import json
from corpora.models import Source, Feed
from corpora import collector
import sampler
def load_sources():
    """
    Loads sources to construct a corpus from.

    Reads `sources.json` (relative to the current working directory) and,
    for every source name in it, makes sure a `Source` with that name
    exists, creating and saving one if the lookup finds nothing. Each feed
    URL listed under the source is then saved as a `Feed` linked to that
    source, unless a `Feed` with the same `ext_url` already exists.

    Note that the feed lookup is by URL only, so a URL already stored under
    a different source is not added again.

    The `sources.json` file should have a structure of::

        {
            "The New York Times": [
                "http://www.nytimes.com/services/xml/rss/nyt/World.xml",
                "http://www.nytimes.com/services/xml/rss/nyt/politics.xml"
            ]
        }
    """
    # Use a context manager so the file handle is closed even when the
    # JSON fails to parse.
    with open('sources.json', 'r') as sources_fs:
        sources = json.load(sources_fs)

    for source_name, feeds in sources.items():
        # Get/create the Source.
        source = Source.objects(name=source_name).first()
        if not source:
            source = Source(name=source_name)
            source.save()

        # Add the feeds.
        for feed_url in feeds:
            if not Feed.objects(ext_url=feed_url).first():
                feed = Feed(ext_url=feed_url, source=source)
                feed.save()
def collect():
    """
    Runs the collector.

    Simply delegates to `collector.collect()`; takes no arguments and
    returns nothing.
    """
    collector.collect()
def sample():
    """
    Samples from a WikiNews pages-articles XML dump.

    The path to the XML file is read from the second command-line argument
    (`sys.argv[2]`) and handed to `sampler.sample`. If that argument is
    missing, a hint is printed and nothing else happens.
    """
    argv = sys.argv
    if len(argv) >= 3:
        xml_path = argv[2]
        sampler.sample(xml_path)
    else:
        print('Please specify the path to the WikiNews pages-articles XML to sample from.')
def sample_preview():
    """
    Samples from a WikiNews pages-articles XML dump in preview mode.

    Works like `sample`, but calls `sampler.sample` with `preview=True`
    (what preview mode does is up to `sampler.sample`). The path to the XML
    file is read from the second command-line argument (`sys.argv[2]`); if
    it is missing, a hint is printed and nothing else happens.
    """
    argv = sys.argv
    if len(argv) >= 3:
        xml_path = argv[2]
        sampler.sample(xml_path, preview=True)
    else:
        print('Please specify the path to the WikiNews pages-articles XML to sample from.')
def main():
    """
    Runs the command named by the first command-line argument.

    Valid commands are `load_sources`, `collect`, `sample` and
    `sample_preview`; each maps to the module-level function of the same
    name. If no command is given, or the given name is not one of the valid
    commands, a short message is printed and nothing is run.

    Exceptions raised by the command itself are not caught here.
    """
    # Only these names may be dispatched. Looking up arbitrary names in
    # globals() would also accept imported modules (not callable) or `main`
    # itself (endless recursion).
    commands = ('load_sources', 'collect', 'sample', 'sample_preview')

    if len(sys.argv) < 2:
        print('You must specify a command: [' + ', '.join(commands) + ']')
        return

    name = sys.argv[1]
    command = globals().get(name) if name in commands else None
    if command is None:
        print('Doesn\'t seem to be a valid command.')
        return

    # Invoke outside of any try/except so that a KeyError raised inside the
    # command is not mistaken for an unknown command name.
    command()
# Only dispatch a command when executed as a script, not when imported.
if __name__ == "__main__":
    main()