-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.pl
executable file
·77 lines (67 loc) · 1.71 KB
/
scraper.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env perl
# Copyright 2014 Michal Špaček <[email protected]>
# Pragmas.
use strict;
use warnings;
# Modules.
use Database::DumpTruck;
use Encode qw(decode_utf8 encode_utf8);
use English;
use HTML::TreeBuilder;
use LWP::UserAgent;
use URI;
# Don't buffer.
$OUTPUT_AUTOFLUSH = 1;

# URI of service.
my $base_uri = URI->new('http://www.brno.cz/sprava-mesta/volene-organy-mesta/'.
    'zastupitelstvo-mesta-brna/clenove-zastupitelstva-mesta-brna/');

# Open a database handle.
my $dt = Database::DumpTruck->new({
    'dbname' => 'data.sqlite',
    'table' => 'data',
});

# Create a user agent object.
my $ua = LWP::UserAgent->new(
    'agent' => 'Mozilla/5.0',
);

# Get base root.
print 'Page: '.$base_uri->as_string."\n";
my $root = get_root($base_uri);

# Look for items.
# Every step of the lookup is checked, so that a change of the page layout
# ends with a readable message instead of "Can't call method on undefined
# value".
my $telo = $root->find_by_attribute('id', 'telo');
if (! defined $telo) {
    die "Cannot find element with id 'telo' on page '".
        $base_uri->as_string."'.";
}

# Scalar context: first 'div' at or below the 'telo' element.
my $div = $telo->find_by_tag_name('div');
if (! defined $div) {
    die "Cannot find 'div' element inside of element with id 'telo'.";
}

# The list of members is read from the second paragraph.
my @p = $div->find_by_tag_name('p');
if (@p < 2) {
    die 'Cannot find second paragraph with list of members.';
}

# content_list() returns an empty list for an empty element, whereas
# dereferencing content() would die on undef.
foreach my $content ($p[1]->content_list) {

    # Skip child elements, only text nodes hold the names.
    if (ref $content) {
        next;
    }

    # Skip whitespace-only text.
    if ($content !~ m/\S/ms) {
        next;
    }

    my ($jmeno, $strana) = parse_name($content);

    # Don't store rows for text which isn't in 'name (party)' form.
    if (! defined $jmeno || ! defined $strana) {
        warn "Cannot parse name from '".encode_utf8($content)."'.\n";
        next;
    }

    $dt->insert({
        'Jmeno' => $jmeno,
        'Strana' => $strana,
    });
}
# Get root of HTML::TreeBuilder object.
#
# Takes a URI object, downloads the page with the file-level $ua user agent
# and returns the root element of the parsed tree (result of elementify()).
# Dies with the HTTP status line if the GET request isn't successful.
sub get_root {
    my $uri = shift;

    my $get = $ua->get($uri->as_string);
    if (! $get->is_success) {
        die "Cannot GET '".$uri->as_string."' page: ".$get->status_line;
    }

    # NOTE(review): raw content is decoded as UTF-8; this assumes the page
    # is served in UTF-8 - confirm against the response charset.
    my $tree = HTML::TreeBuilder->new;
    $tree->parse(decode_utf8($get->content));

    # parse() only feeds a chunk of data to the parser; eof() has to be
    # called to finish the tree before anything else is done with it.
    $tree->eof;

    return $tree->elementify;
}
# Parse name.
#
# Takes one text item of the member list, e.g. '12. Jan Novak (ODS)', and
# returns the list ($name, $party). The leading 'N.' numbering is optional.
# Both values are undef if the string isn't in the 'name (party)' form.
sub parse_name {
    my $name_string = shift;

    if (! defined $name_string) {
        return (undef, undef);
    }

    # Turn every no-break space into a plain space. This is done first, so
    # that the whitespace matching below doesn't depend on whether \s
    # matches U+00A0 for the given string.
    $name_string =~ s/\x{00a0}/ /gms;

    # Remove leading numbering.
    $name_string =~ s/^\s*\d+\.\s+//ms;

    # Name is everything before the last parenthesized part, party is the
    # content of the parentheses.
    my ($name, $party) = $name_string =~ m/^(.*?)\s+\((.*)\)\s*$/ms;

    # Expand the abbreviation used on the page to the full party name.
    # $party is undef if the pattern above didn't match.
    if (defined $party && $party eq decode_utf8('ŽTB*')) {
        $party = decode_utf8('Žít Brno s podporou Pirátů');
    }

    return ($name, $party);
}