Lawrence Technological University
College of Arts and Science
Department of Mathematics and Computer Sciences

Handouts

A Webscraping example

To find some old, old closing prices of some stocks you can go to Big Charts and request one day at a time and write down the answer. Ok for a few days, but for 20 years of days, a little automation might help.

 
#! /usr/bin/perl -w
# bigchart.pl
# program to scrape some data off the BigCharts Web site
use DBI;
use LWP::UserAgent;
use Time::Local;

# ---------- date subroutines
# constants
@dd = ('00' .. '31');
@mm = ('01' .. '12');
@wdays = qw( Sunday Monday Tuesday Wednesday Thursday Friday Saturday );
sub today_in_sql {
  my @dt = localtime(time);
  return ($dt[5] + 1900) . '-' . $mm[$dt[4]] . '-' . $dd[$dt[3]];
}
sub sql_to_mdy {
  if ($_[0] =~ /^(\d{4})-(\d{1,2})-(\d{1,2})$/) {
    return "$2-$3-$1";
  } else {
    return $_[0];
  }
}
sub sql_to_m_d_y {
  if ($_[0] =~ /^(\d{4})-(\d{1,2})-(\d{1,2})$/) {
    return ($2,$3,$1);
  } else {
    return ();
  }
}
sub month_no {
  my $no = 12;
  my $m;
  my $month = shift;
  foreach $m ('dec','nov','oct','sep','aug','jul',
           'jun','may','apr','mar','feb','jan') {
    if ($month =~ /$m/i) {
      return $no;
    }
    $no--;
  }
  return $no;
}
sub weekday { # argument sql date
  my ($year,$month,$day) = split /-/, shift;
  my ($sec,$min,$hr) = (0,0,12);
  return $wdays[
    (localtime (timelocal ($sec,$min,$hr,$day,$month - 1,$year - 1900)))[6]];
}
sub weekday_no { # argument sql date, returns 0-6
  my ($year,$month,$day) = split /-/, shift;
  my ($sec,$min,$hr) = (0,0,12);
  return
    (localtime (timelocal ($sec,$min,$hr,$day,$month - 1,$year - 1900)))[6];
}
sub first_day { # arguments month(0-11),year(yyyy) returns 0-6
  my ($month,$year) = @_;
  my ($sec,$min,$hr,$day) = (0,0,12,1);
  return (localtime (timelocal ($sec,$min,$hr,$day,$month,$year - 1900)))[6];
}
sub days_in_month { # month, year
  @daytbl = (0,31,28,31,30,31,30,31,31,30,31,30,31,29);
  my $day = 0;
  if ($_[0] == 2 and ($_[1] % 400 == 0 or
                     ($_[1] % 4 == 0 and $_[1] % 100 != 0))) {
    $day = $daytbl[13];
  } elsif ($_[0] > 0 and $_[0] < 13) {
    $day = $daytbl[$_[0]];
  }
  return $day;
}
sub next_day { # argument and return is SQL date
  my($y,$m,$d) = split /-/, shift;
  # forward 1 day
  $d++;
  # adjust if to next month
  if ($d > days_in_month($m,$y)) {
    $m++;
    # adjust if to next year
    if ($m > 12) {
      $y++;
      $m = 1;
    }
    $d = 1;
  }
  return "$y-$mm[$m - 1]-$dd[$d]";
}
sub prev_day { # argument and return is SQL date
  my($y,$m,$d) = split /-/, shift;
  # backward 1 day
  $d--;
  # adjust if to prev month
  if ($d < 1) {
    $m--;
    # adjust if to prev year
    if ($m < 1) {
      $y--;
      $m = 12;
    }
    $d = days_in_month($m,$y);
  }
  return "$y-$mm[$m - 1]-$dd[$d]";
}
sub display_month_day { # argument SQL date
  my @month = qw{January February March April May June
                 July August September October November December};
  my (undef,$m,$d) = split /-/, shift;
  my $dt = $month[$m - 1] . ' ' . (0 + $d);
  return $dt;
}
sub display_full_date { # argument SQL date
  my @month = qw{January February March April May June
                 July August September October November December};
  my ($y,$m,$d) = split /-/, shift;
  $d += 0;
  return "$month[$m - 1] $d, $y";
}

$dbh = DBI->connect('DBI:PgPP:scudder:192.168.1.11','user','password')
   or die "Problem connecting";
$ua = LWP::UserAgent->new;
$date = '1970-01-01';
while ($date le &today_in_sql) {
  ($mo, $dy, $yr) = sql_to_m_d_y($date);
  $qry_str = qq{?detect=1&symbol=ktcax&close_date=$mo%2F$dy%2F$yr&x=0&y=0};
  $req = HTTP::Request->new(GET =>
       'http://bigcharts.marketwatch.com/historical/default.asp' . $qry_str);
  $ans = $ua->request($req)->as_string;
  if ($ans =~ /((No data for KTCAX)|(KTCAX not traded on))/) {
    $close = 'null';
    $status = $dbh->quote($1);
  } else {
    if ($ans =~ /No Split/) {
      $status = $dbh->quote('No Split');
    } else {
      $status = 'null';
    }
    $ans =~ /Closing Price:.*?<B>(\d{1,3}(?:\.\d{1,3})?)/s;
    $close = $1 ? $1 : 'null';
  }
  print "$date, $close, $status \n";
  $insert = qq{insert into atclose (day,price,status)
               values ('$date',$close,$status)};
  $dbh->do($insert)
    or die "Problem with $insert";
  $date = next_day($date);
}
# an excerpt from the Web page
# <SUP>Closing Price:</SUP></FONT></TD>
# <TD ALIGN="right" WIDTH="65%">
#    <FONT FACE="Arial,Helvetica,sans-serif" SIZE="2"  COLOR="#ffffcc">
#    <B>6.75&nbsp;</B></FONT></TD>

Revised October 31, 2004