Lawrence Technological University
College of Arts and Science
Department of Mathematics and Computer Sciences

### A Webscraping example

To find some old, old closing prices of some stocks you can go to Big Charts and request one day at a time and write down the answer. Ok for a few days, but for 20 years of days, a little automation might help.

```
#! /usr/bin/perl -w
# bigchart.pl
# program to scrape some data off the BigCharts Web site
use DBI;
use LWP::UserAgent;
use Time::Local;

# ---------- date subroutines
# constants
@dd = ('00' .. '31');
@mm = ('01' .. '12');
@wdays = qw( Sunday Monday Tuesday Wednesday Thursday Friday Saturday );
sub today_in_sql {
my @dt = localtime(time);
return (\$dt + 1900) . '-' . \$mm[\$dt] . '-' . \$dd[\$dt];
}
sub sql_to_mdy {
if (\$_ =~ /^(\d{4})-(\d{1,2})-(\d{1,2})\$/) {
return "\$2-\$3-\$1";
} else {
return \$_;
}
}
sub sql_to_m_d_y {
if (\$_ =~ /^(\d{4})-(\d{1,2})-(\d{1,2})\$/) {
return (\$2,\$3,\$1);
} else {
return ();
}
}
sub month_no {
my \$no = 12;
my \$m;
my \$month = shift;
foreach \$m ('dec','nov','oct','sep','aug','jul',
'jun','may','apr','mar','feb','jan') {
if (\$month =~ /\$m/i) {
return \$no;
}
\$no--;
}
return \$no;
}
sub weekday { # argument sql date
my (\$year,\$month,\$day) = split /-/, shift;
my (\$sec,\$min,\$hr) = (0,0,12);
return \$wdays[
(localtime (timelocal (\$sec,\$min,\$hr,\$day,\$month - 1,\$year - 1900)))];
}
sub weekday_no { # argument sql date, returns 0-6
my (\$year,\$month,\$day) = split /-/, shift;
my (\$sec,\$min,\$hr) = (0,0,12);
return
(localtime (timelocal (\$sec,\$min,\$hr,\$day,\$month - 1,\$year - 1900)));
}
sub first_day { # arguments month(0-11),year(yyyy) returns 0-6
my (\$month,\$year) = @_;
my (\$sec,\$min,\$hr,\$day) = (0,0,12,1);
return (localtime (timelocal (\$sec,\$min,\$hr,\$day,\$month,\$year - 1900)));
}
sub days_in_month { # month, year
@daytbl = (0,31,28,31,30,31,30,31,31,30,31,30,31,29);
my \$day = 0;
if (\$_ == 2 and (\$_ % 400 == 0 or
(\$_ % 4 == 0 and \$_ % 100 != 0))) {
\$day = \$daytbl;
} elsif (\$_ > 0 and \$_ < 13) {
\$day = \$daytbl[\$_];
}
return \$day;
}
sub next_day { # argument and return is SQL date
my(\$y,\$m,\$d) = split /-/, shift;
# forward 1 day
\$d++;
# adjust if to next month
if (\$d > days_in_month(\$m,\$y)) {
\$m++;
# adjust if to next year
if (\$m > 12) {
\$y++;
\$m = 1;
}
\$d = 1;
}
return "\$y-\$mm[\$m - 1]-\$dd[\$d]";
}
sub prev_day { # argument and return is SQL date
my(\$y,\$m,\$d) = split /-/, shift;
# backward 1 day
\$d--;
# adjust if to prev month
if (\$d < 1) {
\$m--;
# adjust if to prev year
if (\$m < 1) {
\$y--;
\$m = 12;
}
\$d = days_in_month(\$m,\$y);
}
return "\$y-\$mm[\$m - 1]-\$dd[\$d]";
}
sub display_month_day { # argument SQL date
my @month = qw{January February March April May June
July August September October November December};
my (undef,\$m,\$d) = split /-/, shift;
my \$dt = \$month[\$m - 1] . ' ' . (0 + \$d);
return \$dt;
}
sub display_full_date { # argument SQL date
my @month = qw{January February March April May June
July August September October November December};
my (\$y,\$m,\$d) = split /-/, shift;
\$d += 0;
return "\$month[\$m - 1] \$d, \$y";
}

or die "Problem connecting";
\$ua = LWP::UserAgent->new;
\$date = '1970-01-01';
while (\$date le &today_in_sql) {
(\$mo, \$dy, \$yr) = sql_to_m_d_y(\$date);
\$qry_str = qq{?detect=1&symbol=ktcax&close_date=\$mo%2F\$dy%2F\$yr&x=0&y=0};
\$req = HTTP::Request->new(GET =>
'http://bigcharts.marketwatch.com/historical/default.asp' . \$qry_str);
\$ans = \$ua->request(\$req)->as_string;
if (\$ans =~ /((No data for KTCAX)|(KTCAX not traded on))/) {
\$close = 'null';
\$status = \$dbh->quote(\$1);
} else {
if (\$ans =~ /No Split/) {
\$status = \$dbh->quote('No Split');
} else {
\$status = 'null';
}
\$ans =~ /Closing Price:.*?<B>(\d{1,3}(?:\.\d{1,3})?)/s;
\$close = \$1 ? \$1 : 'null';
}
print "\$date, \$close, \$status \n";
\$insert = qq{insert into atclose (day,price,status)
values ('\$date',\$close,\$status)};
\$dbh->do(\$insert)
or die "Problem with \$insert";
\$date = next_day(\$date);
}
# an excerpt from the Web page
# <SUP>Closing Price:</SUP></FONT></TD>
# <TD ALIGN="right" WIDTH="65%">
#    <FONT FACE="Arial,Helvetica,sans-serif" SIZE="2"  COLOR="#ffffcc">
#    <B>6.75&nbsp;</B></FONT></TD>
```

Revised October 31, 2004