データベーススクリプト

抜粋版ですが公開します。
もし、ニコニコ動画にアクセスして何かしたいという方は、重そうな時間とか午前四時を避けてください。
それから、タグ検索は重たい処理らしいので、やさしくお願いします。
なお、「アイドルマスター」タグを全件取得すると、メモリ500MB使って、ループの終了と開始(for文)*1に3秒くらいかかるようになります。

全体の流れ

  1. ログイン
  2. タグ検索
  3. ビデオID収集
  4. API問い合わせ
  5. XML解析
  6. DB書き込み

ログインからAPI問い合わせまでは割愛します。LWPでログインしてHTMLを取得、正規表現でビデオIDを収集して、それぞれAPIに問い合わせているだけなので。
一応重複取得を避けるためにチェックを入れてます。

# %duplication_check を作っておく
# @list に抜き出したIDを入れておく

# 重複してないIDをこれに入れる
my @video_id_list;

foreach(@list){
	unless ($duplication_check{"$_"}) {
		push (@video_id_list, $_);
	}
	$duplication_check{"$_"}++;
}
my @video_infos = get_video_infos(@video_id_list);

XML解析

サブルーチンの抜粋です。

# In : array of video_id
# Out: array of hash of video_info
#      hash{tags} is ref array of tag text

my $ua = LWP::UserAgent->new( keep_alive => 1 );
my $parser  = new XML::DOM::Parser;

foreach my $video_id (@video_ids) {
	my %hash;

	my $res = $ua->get($api_url);
	unless ($res->is_success) {
			warn "Wait for API.\n";
			sleep 3;
			$res = $ua->get($api_url);
			die "can't get api result.\n" unless $res->is_success;
	}

	my $doc = $parser->parse($res->content);

	my($sec, $min, $hour, $day, $mon, $year) = localtime(time);
	$year += 1900;
	$mon++;
	$hash{'get_time'} = "$year-$mon-${day}T$hour:$min:$sec";

	$hash{'video_id'} = $video_id;

	# DELETED, NOT_FOUNDの処理
	my ($response) = $doc->getElementsByTagName('nicovideo_thumb_response');
	my $api_status = $response->getAttributes->getNamedItem('status')->getNodeValue;
	if ($api_status eq 'fail') {
		my ($code) = $doc->getElementsByTagName('code');
		$hash{'status'} = $code->getFirstChild->getNodeValue;
		next;
	} else {
		$hash{'status'} = 'ok';
	}

	my ($title) = $doc->getElementsByTagName('title');
	if ($title->hasChildNodes) {
			$hash{'title'} = $title->getFirstChild->getNodeValue;
	} else {
			$hash{'title'} = "";
	}

	my ($description) = $doc->getElementsByTagName('description');
	if ($description->hasChildNodes) {
			$hash{'description'} = 
			  $description->getFirstChild->getNodeValue ||
			  warn "Desc: $video_id cant't get description.\n";
	} else {
			$hash{'description'} = "";
	}

	my ($first_retrieve) = $doc->getElementsByTagName('first_retrieve');
	$hash{'first_retrieve'} = $first_retrieve->getFirstChild->getNodeValue;

	my ($length) = $doc->getElementsByTagName('length');
	$hash{'length'} = $length->getFirstChild->getNodeValue;

	my ($view_counter) = $doc->getElementsByTagName('view_counter');
	$hash{'view_counter'} = $view_counter->getFirstChild->getNodeValue;

	my ($comment_num) = $doc->getElementsByTagName('comment_num');
	$hash{'comment_num'} = $comment_num->getFirstChild->getNodeValue;

	my ($mylist_counter) = $doc->getElementsByTagName('mylist_counter');
	$hash{'mylist_counter'} = $mylist_counter->getFirstChild->getNodeValue;

	my ($tags) = $doc->getElementsByTagName('tags');
	$hash{'tags'} = undef;
	if ($tags->hasChildNodes) {
		my @tag_list;
		foreach my $tag ($tags->getChildNodes) {
			next if ($tag->getNodeName eq '#text');
			next unless $tag->hasChildNodes;
			push @tag_list, $tag->getFirstChild->getNodeValue;
		}
		$hash{'tags'} = \@tag_list;
	}
	push @infos, \%hash;
	usleep(2);
}
return @infos;

DB書き込み

準備とループ
# 先に
# my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "",
#                       { RaiseError => 1, AutoCommit => 0});
# しておく

foreach my $video_info (@video_infos) {
	my %info = %{$video_info};

	my $status = proc_status($dbh, $info{'status'});

	$info{'video_id'} =~ m/(..)(\d+)/;
	my $prefix = $1;
	my $video_id = $2;
	$prefix = proc_prefix($dbh, $prefix);

	if ($status ne "1") {
			proc_video($dbh, $video_id, 0, 0, 0);
			next;
	}

	my ($year, $month, $day, $hour, $min, $sec) =
	    $info{'first_retrieve'} =~ m/^(\d+)\-(\d+)\-(\d+)T(\d+):(\d+):(\d+)/;
	$month = $1 if ($month =~ m/^0(\d)$/);
	$day = $1 if ($day =~ m/^0(\d)$/);
	$hour = $1 if ($hour =~ m/^0(\d)$/);
	$min = $1 if ($min =~ m/^0(\d)$/);
	$sec = $1 if ($sec =~ m/^0(\d)$/);
	my $first_retrieve = timelocal($sec, $min, $hour, $day, $month-1, $year-1900);

	my @duration = $info{'length'} =~ /(\d+)/g;
	my $length = $duration[-1] + ($duration[-2] * 60) + ($duration[-3] * 60 * 60);

	if ($status=~/[02]/) {
			proc_video($dbh, $video_id, $prefix, 0, 0, 0);
	} else {
			proc_video(
					$dbh,
					$video_id,
					$prefix,
					$info{'title'},
					$first_retrieve,
					$length
			);
	}

	proc_video_comment($dbh, $video_id, $info{'description'});

	($year, $month, $day, $hour, $min, $sec) =
	    $info{'get_time'} =~ m/^(\d+)\-(\d+)\-(\d+)T(\d+):(\d+):(\d+)/;
	my $get_date = timelocal($sec, $min, $hour, $day, $month-1, $year-1900);

	my $video_info_id =
	   proc_get_video_info(
			   $dbh, $video_id, $status,
			   $get_date, $info{'view_counter'},
			   $info{'comment_num'}, $info{'mylist_counter'}
	   );

	my @tag_ids = proc_tag($dbh, \@{$info{'tags'}});
	proc_video_tag_info(
			$dbh, $video_id, $video_info_id, \@tag_ids
	);

}
$dbh->commit;
print "DB written. waiting.\n";
sleep 3;
TAG
sub proc_tag {
	my ($dbh, $ref_tags) = @_;
	my @tag_ids;
	my @tag_not_found;
	my $sql = "SELECT id FROM tag WHERE tag_name=?";
	my $sth = $dbh->prepare($sql);
	foreach my $tag_name (@{$ref_tags}) {
		last if $tag_name eq "";
		$sth->execute("$tag_name");
		my $tag_id = $sth->fetch;
		if ($tag_id) {
			$tag_id = ${$tag_id}[0];
			push @tag_ids, $tag_id;
		} else {
			push @tag_not_found, $tag_name;
		}
	}
	$sth->finish;

	$sth = $dbh->prepare(
	             "INSERT INTO tag (tag_name) VALUES (?)") || 
	             warn "TAG INSERT prepare FAILD\n";
	foreach (@tag_not_found) {
		$sth->execute("$_") || warn "TAG INSERT execute FAILD\n";
		my $tag_id = $dbh->last_insert_id(undef, undef, "tag", undef);
		push @tag_ids, $tag_id;
	}
	$sth->finish;
	return @tag_ids;
}
VIDEO
sub proc_video {
	my ($dbh, $video_id, $prefix, $title, $first_retrieve, $length) = @_;
	my $sth = $dbh->prepare("SELECT id, prefix FROM video WHERE id=?");
	   $sth->execute($video_id);
	my $vid = $sth->fetch;
	$sth->finish;

	if ($vid eq "") {
		my $sql = "INSERT INTO video".
		       " (id, prefix, title, post_date, video_duration)".
		       "VALUES (?,?,?,?,?)";
		my $sth = $dbh->prepare($sql);
		   $sth->execute(
		                 $video_id, $prefix, "$title",
		                 $first_retrieve, $length
		                );
	}
	$sth->finish;
}
VIDEO_COMMENT
sub proc_video_comment {
	my ($dbh, $video_id, $desc) = @_;
	my $sth = $dbh->prepare("SELECT id FROM video_comment WHERE id=?");
	   $sth->execute($video_id);
	my $vid = $sth->fetch;
	$sth->finish;

	if ($vid eq "") {
		my $sql = "INSERT INTO video_comment".
		       " (video_id, creators_comment)".
		       "VALUES (?,?)";
		my $sth = $dbh->prepare($sql);
		   $sth->execute(
		                 $video_id, "$desc"
		                );
	}
	$sth->finish;
}
GET_VIDEO_INFO
sub proc_get_video_info {
	my ($dbh, $video_id, $status, $get_date, $view, $comment, $list) = @_;
	my $sql = "INSERT INTO get_video_info".
	       " (video_id, status, get_date,".
	       "  play_count, commented_count, mylisted_count)".
	       "VALUES (?,?,?,?,?,?)";
	my $sth = $dbh->prepare($sql);
	$sth->execute(
	              $video_id,$status,
	              $get_date, $view, $comment, $list
	             ) || warn "$sql\n$video_id, $status, $get_date, $view, $comment, $list\n";
	$sth->finish;
	return $dbh->last_insert_id(undef, undef, "get_video_info", undef);
}
VIDEO_TAG_INFO
sub proc_video_tag_info {
	my ($dbh, $video_id, $video_info_id, $tag_ids) = @_;
	my $sql = "INSERT INTO video_tag_info".
	       " (video_id, get_video_info_id, tag_id)".
	       "VALUES (?,?,?)";
	my $sth = $dbh->prepare($sql);
	foreach my $tag_id (@{$tag_ids}) {
		$sth->execute(
		           $video_id, $video_info_id, $tag_id
		      ) || warn "$sql\n$video_id, $video_info_id, $_";
	}
	$sth->finish;
}
PREFIX
sub proc_prefix {
	my ($dbh, $prefix_name) = @_;
	my $sth = $dbh->prepare("SELECT id FROM prefix WHERE prefix_name=?");
	   $sth->execute("$prefix_name");
	my $re = $sth->fetch;
	$sth->finish;

	if ($re eq "") {
		my $sql = "INSERT INTO prefix (prefix_name) VALUES (?)";
		my $sth = $dbh->prepare($sql);
		   $sth->execute($prefix_name);
	} else {
	    return ${$re}[0];
	}
	$sth->finish;
	return $dbh->last_insert_id(undef, undef, "prefix", undef);
}
STATUS
sub proc_status {
	my ($dbh, $status_name) = @_;
	my $sth = $dbh->prepare("SELECT id FROM status WHERE status_name=?");
	   $sth->execute("$status_name");
	my $re = $sth->fetch;
	$sth->finish;

	if ($re eq "") {
		my $sql = "INSERT INTO status (status_name) VALUES (?)";
		my $sth = $dbh->prepare($sql);
		   $sth->execute($status_name);
	} else {
	    return ${$re}[0];
	}
	$sth->finish;
	return $dbh->last_insert_id(undef, undef, "status", undef);
}

*1:最後の「}」直前の処理が終わってから、最初の「{」の次にある処理に移るまで