#!/usr/bin/perl
use strict;
use warnings;
use DBI;

# Database initialization script.
#
# Creates every table that the harvesting scripts write to. Each table is
# created with IF NOT EXISTS, so the script can be re-run against an
# existing file.db without raising "table already exists" errors and
# without touching data that is already stored.

# Connect to the database. RaiseError makes a failed connect or a failed
# statement die with the driver's message instead of being ignored.
my $dbh = DBI->connect(
    "dbi:SQLite:dbname=file.db", "", "",
    { RaiseError => 1, PrintError => 0 },
);

# The list table will store relevant data from the big hathifiles list.
$dbh->do( <<EOF );
CREATE TABLE IF NOT EXISTS list
(name text, umid text, handle text, capture_agent text, dist text)
EOF

# The items table will store data gleaned from the data api's structure query.
$dbh->do( <<EOF );
CREATE TABLE IF NOT EXISTS items
(orig text, fromrecord text, htid text, itemurl text, rightscode text, lastupdate text, enumcron text, usrightsstring text)
EOF

# The records table stores some other bib data pulled from the bib api.
$dbh->do( <<EOF );
CREATE TABLE IF NOT EXISTS records
(id text, recordurl text, marcxml text, sort_string text, sort_integer integer, sort_real real)
EOF

# The marc table stores data extracted from the marcxml returned by the bib api.
$dbh->do( <<EOF );
CREATE TABLE IF NOT EXISTS marc
(id text, title text, author text, edition text, publication_date text, start integer, end integer)
EOF

# The following are tables for storing multi-valued fields in the bib record.
$dbh->do('CREATE TABLE IF NOT EXISTS isbns (id text, isbn text)');
$dbh->do('CREATE TABLE IF NOT EXISTS issns (id text, issn text)');
$dbh->do('CREATE TABLE IF NOT EXISTS lccns (id text, lccn text)');
$dbh->do('CREATE TABLE IF NOT EXISTS oclcs (id text, oclc text)');
$dbh->do('CREATE TABLE IF NOT EXISTS titles (id text, title text)');
#!/usr/bin/perl
use strict;
use warnings;
use LWP::Simple;
use XML::XPath;
use DBI;

# First pass over the full hathifiles list.
#
# For every public-domain line ('pd' or 'pdus') in the tab-delimited input
# file, fetch the volume's structure document from the HathiTrust Data API
# and store one row in the list table per PREMIS "capture" event found.
# Volumes that already have rows in the list table are skipped, so the
# script can be stopped and restarted.

# This is the same database we set up in the database initialization step.
# RaiseError turns any failed statement into a fatal error instead of a
# warning that scrolls past unnoticed during a long run.
my $dbh = DBI->connect(
    "dbi:SQLite:dbname=file.db", "", "",
    { RaiseError => 1, PrintError => 0 },
);

# Prepare the statement for storing data in the list table.
my $sth = $dbh->prepare( <<EOF );
insert into list (name, umid, handle, capture_agent, dist) values (?, ?, ?, ?, ?)
EOF

# Prepare a statement for checking whether the data already exists in the list table.
my $done = $dbh->prepare('select count(*) from list where handle=?');

# $data_template will be used in an sprintf with a handle system handle
# to generate the REST GET query to the HathiTrust Data API.
my $data_template = 'http://services.hathitrust.org/api/htd/structure/%s';

# Open our data file. You'll want a current one with YYYYMMDD updated below.
# Three-argument open with a lexical handle, and fail loudly: a missing
# file would otherwise look like a successful run that did nothing.
my $list_file = 'hathi_full_YYYYMMDD.txt';
open(my $in, '<', $list_file) or die "Cannot open $list_file: $!\n";

while (my $line = <$in>) {
    # Lines from hathi_full_YYYYMMDD.txt are 1-record per line, tab delimited.
    # See http://www.hathitrust.org/hathifiles_metadata for more details.
    chomp $line;
    my @line = split(/\t/, $line);

    # We're only interested in public domain materials. The defined check
    # also skips blank or truncated lines without an uninitialized warning.
    next unless defined $line[2] && ($line[2] eq 'pd' || $line[2] eq 'pdus');

    # The process takes a while, and we might stop and restart it,
    # so skip this volume if we've already done it.
    $done->execute($line[0]);
    next if $done->fetchrow_arrayref()->[0] > 0;

    # The first field is a volume identifier/handle, we put that into our API.
    my $url = sprintf($data_template, $line[0]);
    print STDERR "fetching $url\n";
    my $content = get($url);

    # LWP::Simple::get returns undef on failure, so test definedness before
    # length. You might be interested in better error handling, but this
    # is the basic.
    unless (defined $content && length $content) {
        die "Failed to fetch data from $url\n";
    }

    # We're using XML::XPath to get the events, with interest in the "capture" event.
    my $xp = XML::XPath->new('xml' => $content);
    if ($xp) {
        my $nodes = $xp->find('//PREMIS:event');
        if ($nodes) {
            foreach my $node ($nodes->get_nodelist) {
                next unless $node;
                my $type  = $node->findvalue('PREMIS:eventType');
                my $agent = $node->findvalue('PREMIS:linkingAgentIdentifier/PREMIS:linkingAgentIdentifierValue');
                next unless $type eq 'capture';

                # It looks like we found the capture agent. Let's throw this
                # into the database. Note that there may be multiple capture
                # events, thus the list table isn't completely normalized.
                $sth->execute($line[11], $line[3], $line[0], $agent, $line[2]);
            }
        }
    }

    # I'm going to try to be polite for the thousands
    # of requests I'll be running against this server.
    sleep 3;
}
close($in);
#!/usr/bin/perl
use strict;
use warnings;
use LWP::Simple;
use XML::XPath;
use DBI;

# Update pass over a hathifiles update list.
#
# This code is largely the same as the first pass through the full list.
# The difference is that existing rows for a volume are replaced when the
# volume shows up in the update file.

# This is the same database we set up in the database initialization step.
# RaiseError turns any failed statement into a fatal error instead of a
# warning that scrolls past unnoticed during a long run.
my $dbh = DBI->connect(
    "dbi:SQLite:dbname=file.db", "", "",
    { RaiseError => 1, PrintError => 0 },
);

# Prepare the statement for storing data in the list table.
my $sth = $dbh->prepare( <<EOF );
insert into list (name, umid, handle, capture_agent, dist) values (?, ?, ?, ?, ?)
EOF

# Delete from the list so that the new data can be processed.
# Remember this table isn't completely normalized so a simple update here
# would not suffice.
my $delete = $dbh->prepare('delete from list where handle=?');

# $data_template will be used in an sprintf with a handle system handle
# to generate the REST GET query to the HathiTrust Data API.
my $data_template = 'http://services.hathitrust.org/api/htd/structure/%s';

# Open our data file. You'll want a current one with YYYYMMDD updated below.
# Three-argument open with a lexical handle, and fail loudly: a missing
# file would otherwise look like a successful run that did nothing.
my $update_file = 'hathi_upd_YYYYMMDD.txt';
open(my $in, '<', $update_file) or die "Cannot open $update_file: $!\n";

while (my $line = <$in>) {
    # Lines from hathi_upd_YYYYMMDD.txt are 1-record per line, tab delimited.
    # See http://www.hathitrust.org/hathifiles_metadata for more details.
    chomp $line;
    my @line = split(/\t/, $line);

    # We're only interested in public domain materials. The defined check
    # keeps blank or truncated lines from raising uninitialized warnings.
    my $is_public_domain =
        defined $line[2] && ($line[2] eq 'pd' || $line[2] eq 'pdus');

    # Either the data's already been there, and we want to update it,
    # or the record is no longer marked as public domain, so we want to
    # delete it, or it isn't in the database, in which case the delete
    # doesn't match anything. So every handle seen in the update file has
    # its rows deleted. Records that are not public domain are done here.
    unless ($is_public_domain) {
        $delete->execute($line[0]);
        next;
    }

    # The first field is a volume identifier/handle, we put that into our API.
    my $url = sprintf($data_template, $line[0]);
    print STDERR "fetching $url\n";
    my $content = get($url);

    # LWP::Simple::get returns undef on failure, so test definedness before
    # length. The fetch happens BEFORE the delete so that a failed request
    # stops the run without having thrown away the rows we already had.
    unless (defined $content && length $content) {
        die "Failed to fetch data from $url\n";
    }

    # We're using XML::XPath to get the events, with interest in the
    # "capture" event. Collect the capture agents first, then replace the
    # stored rows in one go.
    my @capture_agents;
    my $xp = XML::XPath->new('xml' => $content);
    if ($xp) {
        my $nodes = $xp->find('//PREMIS:event');
        if ($nodes) {
            foreach my $node ($nodes->get_nodelist) {
                next unless $node;
                my $type  = $node->findvalue('PREMIS:eventType');
                my $agent = $node->findvalue('PREMIS:linkingAgentIdentifier/PREMIS:linkingAgentIdentifierValue');
                push @capture_agents, $agent if $type eq 'capture';
            }
        }
    }

    # Replace the old rows. Note that there may be multiple capture events,
    # thus the list table isn't completely normalized.
    $delete->execute($line[0]);
    foreach my $agent (@capture_agents) {
        $sth->execute($line[11], $line[3], $line[0], $agent, $line[2]);
    }

    # I'm going to try to be polite for the thousands
    # of requests I'll be running against this server.
    sleep 3;
}
close($in);
#!/usr/bin/perl
use strict;
use warnings;
use LWP::Simple;
use XML::XPath;
use DBI;
use JSON;
use MARC::File::XML;
use MARC::Charset;
use Encode;

# Bibliographic harvesting pass.
#
# Selects rows from the list table, fetches the matching record from the
# HathiTrust Bibliographic API, and stores the returned record, the fields
# extracted from its MARC-XML, the multi-valued identifiers and the linked
# items. All bibliographic tables are emptied first and rebuilt from scratch.

MARC::Charset->ignore_errors(1);

binmode STDOUT, ":utf8";

# RaiseError turns any failed statement into a fatal error instead of a
# warning that scrolls past unnoticed during a long run.
my $dbh = DBI->connect(
    "dbi:SQLite:dbname=file.db", "", "",
    { RaiseError => 1, PrintError => 0 },
);

# These are the records for which we want bibliographic data.
my $sth = $dbh->prepare( <<EOF );
select distinct name, umid, handle, dist, capture_agent from list
where handle not in (select handle from list where capture_agent='Google, Inc.')
and umid not in (select id from records)
EOF

# $bib_template will be used in an sprintf with a 9-digit HathiTrust record id
# to generate the REST GET query to the HathiTrust Bibliographic API.
my $bib_template = 'http://catalog.hathitrust.org/api/volumes/full/umid/%s.json';

# Once we gather the bib data, we'll store it.
my $insert_records = $dbh->prepare('insert into records (id, recordurl, marcxml) values (?, ?, ?)');
my $insert_titles  = $dbh->prepare('insert into titles (id, title) values (?, ?)');
my $insert_isbns   = $dbh->prepare('insert into isbns (id, isbn) values (?, ?)');
my $insert_issns   = $dbh->prepare('insert into issns (id, issn) values (?, ?)');
my $insert_oclcs   = $dbh->prepare('insert into oclcs (id, oclc) values (?, ?)');
my $insert_lccns   = $dbh->prepare('insert into lccns (id, lccn) values (?, ?)');
my $insert_items   = $dbh->prepare( <<EOF );
insert into items
(fromrecord, orig, htid, itemurl, rightscode, lastupdate, enumcron, usrightsstring)
values (?, ?, ?, ?, ?, ?, ?, ?)
EOF
my $insert_marc = $dbh->prepare( <<EOF );
insert into marc (id, title, author, edition, publication_date,start,end) values (?, ?, ?, ?, ?, ?, ?)
EOF

# We're going to clean out old data. The number of documents we're gathering
# data on is relatively small, so we aren't worried about processing time at
# this point.
foreach my $table (qw(records titles isbns issns oclcs lccns items marc)) {
    $dbh->do("delete from $table");
}

# Get our list of records for which we want bibliographic data, and loop through it.
$sth->execute();
while (my @row = $sth->fetchrow_array) {
    # Put our record id (the umid column) into the REST GET request.
    my $x = get(sprintf($bib_template, $row[1]));

    # Make sure we got data before trying to use it.
    if (defined($x) && length($x) > 0) {
        # The data comes back in json, so decode it.
        my $y = from_json($x);

        # Making sure that we got valid-looking data, we'll throw it into the database.
        if (defined($y->{'records'})) {
            foreach my $id (keys %{ $y->{'records'} }) {
                my $record = $y->{'records'}{$id};

                # Since we've got data, let's store the single-valued fields for this record.
                $insert_records->execute($id, $record->{'recordURL'}, $record->{'marc-xml'});

                # We asked for the marc-xml version of the bib data, let's
                # process that, and store data from that too, if we can.
                my $marc = MARC::File::XML->decode(encode_utf8($record->{'marc-xml'}));
                if ($marc) {
                    # Store what we can from the marc. If the publication
                    # date was a date range, the start and end are saved in
                    # their own fields; both are -1 when no usable date was
                    # found, so the row is stored either way.
                    my ($start, $end) = _publication_year_range($marc->publication_date);
                    $insert_marc->execute(
                        $id,
                        $marc->title,
                        $marc->author,
                        $marc->edition,
                        $marc->publication_date,
                        $start,
                        $end
                    );
                }

                # Now I'll put the multiple-valued fields into tables.
                foreach my $title (@{ $record->{'titles'} }) {
                    $insert_titles->execute($id, $title);
                }
                foreach my $isbn (@{ $record->{'isbns'} }) {
                    $insert_isbns->execute($id, $isbn);
                }
                foreach my $issn (@{ $record->{'issns'} }) {
                    $insert_issns->execute($id, $issn);
                }
                foreach my $oclc (@{ $record->{'oclcs'} }) {
                    $insert_oclcs->execute($id, $oclc);
                }
                foreach my $lccn (@{ $record->{'lccns'} }) {
                    $insert_lccns->execute($id, $lccn);
                }
            }
        }

        # Records can be linked to multiple items. For example, this would
        # happen with multivolume works. We want to store the data about
        # these items.
        if (defined($y->{'items'})) {
            foreach my $item (@{ $y->{'items'} }) {
                $insert_items->execute(
                    $item->{'fromRecord'},
                    $item->{'orig'},
                    $item->{'htid'},
                    $item->{'itemURL'},
                    $item->{'rightsCode'},
                    $item->{'lastUpdate'},
                    $item->{'enumcron'},
                    $item->{'usRightsString'}
                );
            }
        }
    }

    # Be polite to the server between requests.
    sleep 3;
}

# Derive a (start, end) pair from a raw publication date string.
#
# The publication dates weren't very clean: some had ['s, some didn't, some
# had ranges, some didn't. The clean-up keeps only digits, spaces and
# hyphens, picks the longest space-separated token, and splits it on
# hyphens:
#   - one piece  -> start and end are both that piece;
#   - two pieces -> start is the first piece (or the second when the first
#                   is empty); a shorter second piece is completed with the
#                   leading characters of start (e.g. "1990-95" ends "1995").
# Anything else, including an empty or false input, yields (-1, -1).
sub _publication_year_range {
    my ($raw) = @_;
    my ($start, $end) = (-1, -1);
    return ($start, $end) unless $raw;

    my $cleaned = $raw;
    $cleaned =~ s/\(v\.[^)]*\)//g;    # drop "(v. ...)" volume notes
    $cleaned =~ s/[^0-9 -]//g;        # keep digits, spaces and hyphens only

    my $longest = '';
    foreach my $token (split(/ +/, $cleaned)) {
        $longest = $token if length($token) > length($longest);
    }

    my @parts = split(/-+/, $longest);
    if (scalar(@parts) == 1) {
        $start = $parts[0];
        $end   = $parts[0];
    }
    elsif (scalar(@parts) == 2) {
        $start = $parts[0];
        if (length($start) == 0) {
            $start = $parts[1];
        }
        if (length($parts[1]) < length($start)) {
            # Abbreviated end such as "95": borrow the missing prefix from start.
            $end = substr($start, 0, length($start) - length($parts[1])) . $parts[1];
        }
        else {
            $end = $parts[1];
        }
    }
    return ($start, $end);
}