-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.pl
executable file
·100 lines (87 loc) · 2.37 KB
/
scraper.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env perl
# Copyright 2014-2015 Michal Špaček <[email protected]>
# Pragmas.
use strict;
use warnings;
# Modules.
use Database::DumpTruck;
use Encode qw(decode_utf8 encode_utf8);
use English;
use HTML::TreeBuilder;
use LWP::UserAgent;
use POSIX qw(strftime);
use URI;
use Time::Local;
# Version.
our $VERSION = 0.02;

# Don't buffer.
$OUTPUT_AUTOFLUSH = 1;

# URI of service (profile page whose offers table is scraped).
my $base_uri = URI->new('https://uverejnovani.cz/profiles/details/statutarni-mesto-brno-mestska-cast-brno-sever');

# Open a database handle.
my $dt = Database::DumpTruck->new({
    'dbname' => 'data.sqlite',
    'table' => 'data',
});

# Create a user agent object.
my $ua = LWP::UserAgent->new(
    'agent' => 'Mozilla/5.0',
);

# NOTE(review): TLS certificate and hostname verification are switched off
# here. Presumably this works around a certificate problem of the service;
# confirm whether it is still needed, because the fetched data is not
# authenticated while it is disabled.
$ua->ssl_opts(
    'verify_hostname' => 0,
    'SSL_verify_mode' => 0x00,
);

# Get base root.
print 'Page: '.$base_uri->as_string."\n";
my $root = get_root($base_uri);

# Look for items. Each lookup is checked so that a changed page layout is
# reported clearly instead of failing on an undefined value.
my $table = $root->find_by_attribute('class', 'list pz_offers');
if (! defined $table) {
    die "Cannot find table with class 'list pz_offers' on '".
        $base_uri->as_string."' page.";
}
my $tbody = $table->find_by_tag_name('tbody');
if (! defined $tbody) {
    die "Cannot find 'tbody' element in offers table on '".
        $base_uri->as_string."' page.";
}
my @tr = $tbody->find_by_tag_name('tr');
# Process each row of the offers table: extract the fields, and insert the
# offer into the database unless a record with the same ID is already stored.
foreach my $tr (@tr) {

    # Text content of the cells, looked up by their class names.
    my ($id, $name, $type, $published) = map {
        my $cell = $tr->find_by_attribute('class', $_);
        if (! defined $cell) {
            die "Cannot find cell with class '$_' in offers table row.";
        }
        $cell->as_text;
    } qw(id name type published);

    # Href of the first anchor in the 'actions' cell.
    my $actions = $tr->find_by_attribute('class', 'actions');
    my $anchor = defined $actions ? $actions->find_by_tag_name('a') : undef;
    my $href = defined $anchor ? $anchor->attr('href') : undef;
    if (! defined $href) {
        die encode_utf8("Cannot find detail link for item '$id'.");
    }

    # Resolve the href against the page URI. Unlike plain concatenation of
    # scheme and host, this keeps a port and also handles absolute and
    # non-root-relative hrefs.
    my $link = URI->new_abs($href, $base_uri)->as_string;

    $published = get_db_datetime($published);

    # Ask the database whether the item is already stored. The query is
    # wrapped in eval; if it fails, the item is treated as not stored.
    my $ret_ar = eval {
        $dt->execute('SELECT COUNT(*) FROM data WHERE ID = ?', $id);
    };
    my $stored = 0;
    if (! $EVAL_ERROR && @{$ret_ar}) {
        my $count = $ret_ar->[0]->{'count(*)'};
        $stored = defined $count && $count != 0;
    }
    next if $stored;

    # Save.
    print encode_utf8("$id: $name\n");
    $dt->insert({
        'ID' => $id,
        'Name' => $name,
        'Type' => $type,
        'Published' => $published,
        'Link' => $link,
    });
}
# Get DB date from web datetime.
#
# Arguments:
#   $datetime - String in 'D.M.YYYY H:MM:SS' form. The time part (or just
#               its seconds) may be omitted; surrounding whitespace is
#               ignored.
#
# Returns date string in 'YYYY-MM-DD' form.
#
# Dies if the string does not have the expected form. Out-of-range values
# (e.g. day 32) are rejected by timelocal().
sub get_db_datetime {
    my $datetime = shift;

    my @parts;
    if (defined $datetime) {
        @parts = $datetime
            =~ m/^\s*(\d+)\.(\d+)\.(\d{4})(?:\s+(\d+):(\d+)(?::(\d+))?)?\s*$/ms;
    }
    if (! @parts) {
        die "Cannot parse date '".(defined $datetime ? $datetime : '')."'.";
    }

    # Missing time fields come back as undef from the match; use zero.
    my ($day, $mon, $year, $hour, $min, $sec)
        = map { defined $_ ? $_ : 0 } @parts;

    # Pass the full four-digit year. A year reduced by 1900 falls into the
    # 0..99 range for 1900-1999, which Time::Local treats as a two-digit year
    # and maps to a century near the current date.
    my $time = timelocal($sec, $min, $hour, $day, $mon - 1, $year);

    return strftime('%Y-%m-%d', localtime($time));
}
# Get root of HTML::TreeBuilder object.
#
# Arguments:
#   $uri - URI object of the page to fetch.
#
# Fetches the page with the file-level $ua user agent, decodes the body as
# UTF-8 and parses it.
#
# Returns root element of the parsed tree (result of elementify()).
#
# Dies if the HTTP request is not successful.
sub get_root {
    my $uri = shift;

    my $get = $ua->get($uri->as_string);
    if (! $get->is_success) {
        die "Cannot GET '".$uri->as_string."' page: ".$get->status_line;
    }

    my $tree = HTML::TreeBuilder->new;
    $tree->parse(decode_utf8($get->content));

    # parse() does not finish the tree by itself; eof() must be called once
    # all content has been fed in.
    $tree->eof;

    return $tree->elementify;
}