-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathed-devtracker-collector.pl
executable file
·219 lines (200 loc) · 7.12 KB
/
ed-devtracker-collector.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#!/usr/bin/perl -w -I.
# vim: textwidth=0 wrapmargin=0 shiftwidth=2 tabstop=2 expandtab softtabstop
use strict;
use Encode;
use Data::Dumper;
use JSON::PP;
use LWP;
use HTTP::Cookies;
use Digest::MD5 qw(md5_hex);
use File::Flock;
use ED::DevTracker::Config;
use ED::DevTracker::DB;
use ED::DevTracker::Scrape;
use ED::DevTracker::RSS;
# Force the process timezone to UTC.
$ENV{'TZ'} = 'UTC';

# Settings come from config.txt in the current directory; give up straight
# away if the config object could not be created.
my $config = ED::DevTracker::Config->new(file => "config.txt");
die "No config!\n" unless defined($config);

# Request the lock file in "nonblocking" mode and die if it isn't granted.
my $lock = File::Flock->new("ed-devtracker-collector.lock", undef, "nonblocking");
die "Couldn't obtain lock\n" unless $lock;

my $db = ED::DevTracker::DB->new('config' => $config);

# HTTP client: agent string and timeout are taken from the config, cookies
# are kept in lwpcookies.txt (autosaved, including discardable ones).
my $ua = LWP::UserAgent->new('agent' => $config->getconf('user_agent'));
$ua->timeout($config->getconf('ua_timeout'));
$ua->cookie_jar(HTTP::Cookies->new(file => "lwpcookies.txt", autosave => 1, ignore_discard => 1));

# The RSS file name is whatever follows the last '/' of the 'self_url'
# setting.  It must already exist in the current directory, otherwise we
# assume the script was started from the wrong place and stop.
my $rss_filename = $config->getconf('self_url');
$rss_filename =~ s/^(.+)\/([^\/]+)/$2/;
unless (-f $rss_filename) {
  chomp(my $cwd = `pwd`);
  printf STDERR "RSS file %s doesn't exist at %s, did you forget to cd before running this script?\n", $rss_filename, $cwd;
  exit(4);
}
# Member list decoded from the JSON file named by the 'memberid_file'
# setting.  The scrape loop further down reads its 'members' array and each
# entry's 'memberid', 'membername' and 'active' fields.
# Exits with -1 if the file can't be opened; decode_json() dies on bad JSON.
my $developers;
{
  # Slurp mode: a single read returns the whole file.
  local $/ = undef;

  my $memberid_file = $config->getconf('memberid_file');
  my $memberid_fh;
  my $open_error;
  if (!defined($memberid_file)) {
    # Guard against a missing setting so open() is never handed an
    # undefined filename.
    $memberid_file = '';
    $open_error = 'no filename configured';
  } elsif (!open($memberid_fh, '<', $memberid_file)) {
    # Three-argument open on a lexical handle: nothing in the configured
    # name can be interpreted as an open mode or a pipe.
    $open_error = "$!";
  }
  if (defined($open_error)) {
    printf STDERR "Failed to open memberid file '%s': %s\n", $memberid_file, $open_error;
    exit(-1);
  }

  # Read raw bytes; decode_json() takes UTF-8 encoded octets.
  binmode $memberid_fh;
  my $member_ids = <$memberid_fh>;
  close($memberid_fh);

  $developers = decode_json($member_ids);
}
# Forum ids to ignore, collected from the JSON file named by the
# 'forum_ignore_file' setting.  The file decodes to a hash whose values each
# carry an 'id' field; only those ids are kept.
# Exits with -1 if the file can't be opened; decode_json() dies on bad JSON.
my @forums_ignored;
{
  # Slurp mode: a single read returns the whole file.
  local $/ = undef;

  my $forum_ignore_file = $config->getconf('forum_ignore_file');
  my $forum_ignore_fh;
  my $open_error;
  if (!defined($forum_ignore_file)) {
    # Guard against a missing setting so open() is never handed an
    # undefined filename.
    $forum_ignore_file = '';
    $open_error = 'no filename configured';
  } elsif (!open($forum_ignore_fh, '<', $forum_ignore_file)) {
    # Three-argument open on a lexical handle: nothing in the configured
    # name can be interpreted as an open mode or a pipe.
    $open_error = "$!";
  }
  if (defined($open_error)) {
    printf STDERR "Failed to open forum ignore file '%s': %s\n", $forum_ignore_file, $open_error;
    exit(-1);
  }

  # Read raw bytes; decode_json() takes UTF-8 encoded octets.
  binmode $forum_ignore_fh;
  my $forums_ignored_input = <$forum_ignore_fh>;
  close($forum_ignore_fh);

  my $forums_ignored_json = decode_json($forums_ignored_input);
  # The hash keys are not needed, only the 'id' of every entry.
  push(@forums_ignored, map { $_->{'id'} } values(%{$forums_ignored_json}));
}
###########################################################################
# First let's make sure we're logged in.
###########################################################################
# NOTE(review): HTML::TreeBuilder is not loaded by a 'use' line in this
# script; presumably one of the ED::DevTracker modules pulls it in - confirm.
my $forum_base_url = $config->getconf('forum_base_url');

# Load the login page to get cookies set up, and to pick up the _xfToken
# value that has to be sent back with the login form.
my $login_form_url = $forum_base_url . "/login";
my $lf_req = HTTP::Request->new('GET', $login_form_url, ['Connection' => 'close']);
my $lf_res = $ua->request($lf_req);
my $login_html = $lf_res->decoded_content();
$login_html = '' if !defined($login_html);

my $tree = HTML::TreeBuilder->new(no_space_compacting => 1);
$tree->parse($login_html);
$tree->eof();
my $xft = $tree->look_down('name', '_xfToken');
# Pull the value out before the tree is destroyed; $xft is part of the tree.
my $xfToken = $xft ? $xft->attr_get_i('value') : undef;
# Older HTML-Tree releases need an explicit delete() to free the tree.
$tree->delete();
undef $tree;
if (!defined($xfToken)) {
  # Show what the server actually answered, it is the only clue available.
  print STDERR "Login page request status: ", $lf_res->status_line, "\n";
  print STDERR $login_html, "\n";
  die("Failed login: can't find _xfToken");
}

my $login_url = $forum_base_url . "/login/login";
my $login_user = $config->getconf('forum_user');
my $login_password = $config->getconf('forum_password');
# Let LWP build the application/x-www-form-urlencoded body, so that a user
# name, password or token containing '&', '+', '%', '=' etc. is escaped
# instead of corrupting the form.  The array ref keeps the field order.
# NOTE(review): if the configured password was stored pre-escaped to work
# around the old raw concatenation, it must now be stored in plain form.
my $res = $ua->post(
  $login_url,
  [
    'login'       => $login_user,
    'password'    => $login_password,
    'remember'    => 1,
    '_xfRedirect' => '/',
    '_xfToken'    => $xfToken,
  ],
  'Connection' => 'close',
  'Origin'     => $forum_base_url . "/login",
  'Referer'    => $forum_base_url . "/login",
);
# In XenForo 2 HTTP 303 (See Other) indicates login success
# HTTP 200 (success) means "already logged in"
if ($res->code != 303 and ! $res->is_success) {
  print STDERR "Failed to login: ", $res->status_line, "\n";
  exit(1);
}
###########################################################################
# Scrape the new posts of every active member, hand them to the DB, then
# regenerate both RSS files if any post was seen.
###########################################################################
my $new_posts_total = 0;
# $new_posts_total = 1; goto RSS_OUTPUT;
my $scrape = ED::DevTracker::Scrape->new($ua, \@forums_ignored);

# Debugging aid: the loop stops at the first member id above this value.
my $bail = 99999999;

# memberid -> membername, built once instead of searching the whole member
# list for every id.  The first entry seen for an id wins and the name is
# stringified (undef becomes ''), matching what the scraper was given before.
my %membername_for;
foreach my $member (@{$developers->{'members'}}) {
  my $id = $member->{'memberid'};
  next if exists($membername_for{$id});
  my $name = $member->{'membername'};
  $membername_for{$id} = defined($name) ? "$name" : '';
}

# Active members only, in ascending numeric id order.
foreach my $whoid ( sort({$a <=> $b} map { $_->{'memberid'} } grep { $_->{'active'} } @{$developers->{'members'}})) {
  if ($whoid > $bail) {
    print STDERR "Bailing after id ", $bail, "\n";
    last;
  }
  my $membername = $membername_for{$whoid};
  my $new_posts = $scrape->get_member_new_posts($whoid, $membername);
  # We're popping off an array so as to reverse the order we found them
  # else they'll go in the DB in the wrong order.
  while (defined(my $p = pop(@{$new_posts}))) {
    # Only posts carrying a datestamp are stored ...
    if (${$p}{'datestamp'}) {
      $db->insert_post($p);
    }
    # ... but every post popped is counted, stored or not.
    $new_posts_total++;
  }
}
RSS_OUTPUT:
if ($new_posts_total > 0) {
  generate_rss_file('false', $config->getconf('self_url'));
  generate_rss_file('true', $config->getconf('self_fulltext_url'));
}
# Sleep to be sure we don't run back to back if the forums are straining
my $sleep_after = $config->getconf('sleep_after');
if (defined($sleep_after) and $sleep_after > 0) {
  sleep($sleep_after);
}
exit(0);
###########################################################################
# Generate an RSS file, either with or without fulltext
#
# Arguments:
#   $fulltext - handed unchanged to ED::DevTracker::RSS->new(); this script
#               calls us with the strings 'false' and 'true'.
#   $self_url - also handed to ED::DevTracker::RSS->new().  Whatever follows
#               its last '/' is the output file name, relative to the
#               current directory.
#
# The feed is written to "<name>.tmp" and then renamed over "<name>", and
# the result is chmod'ed to 0644.
#
# Returns nothing.  Failures are reported on STDERR and end the script:
#   exit 1 - RSS generation failed
#   exit 2 - temporary file could not be opened
#   exit 3 - writing or closing the temporary file failed
#   exit 5 - temporary file could not be renamed into place
# A failing chmod is reported but is not fatal.
###########################################################################
sub generate_rss_file {
  my ($fulltext, $self_url) = @_;

  my $rss = ED::DevTracker::RSS->new($fulltext, $self_url);
  if (! $rss->generate()) {
    printf STDERR "Something failed in RSS generation.\n";
    exit(1);
  }

  # Work on a private copy so the file-level $rss_filename is left alone.
  my $rss_file = $self_url;
  $rss_file =~ s/^(.+)\/([^\/]+)/$2/;
  my $tmp_name = $rss_file . ".tmp";

  my $tmp_fh;
  if (!open($tmp_fh, ">:encoding(utf-8)", $tmp_name)) {
    print STDERR "Couldn't open temporary file '", $tmp_name, "': ", $!, "\n";
    exit(2);
  }
  # Turn on auto-flush, to be SURE those changes are on disk by the time
  # anything else reads them.
  my $old_fh = select($tmp_fh);
  $| = 1;
  select($old_fh);
  if (!print {$tmp_fh} $rss->output) {
    print STDERR "Error writing to tmp RSS file '", $tmp_name, "': ", $!, "\n";
    exit(3);
  }
  # Write errors can surface as late as close(), so it counts as a write.
  if (!close($tmp_fh)) {
    print STDERR "Error writing to tmp RSS file '", $tmp_name, "': ", $!, "\n";
    exit(3);
  }

  # mv tmp to live; never carry on as if a failed publish had worked.
  if (!rename($tmp_name, $rss_file)) {
    print STDERR "Couldn't rename '", $tmp_name, "' to '", $rss_file, "': ", $!, "\n";
    exit(5);
  }
  if (!chmod(0644, $rss_file)) {
    print STDERR "Couldn't chmod '", $rss_file, "': ", $!, "\n";
  }
  return;
}
###########################################################################