データベーススクリプト
抜粋版ですが公開します。
もし、ニコニコ動画にアクセスして何かしたいという方は、重そうな時間とか午前四時を避けてください。
それから、タグ検索は重たい処理らしいので、やさしくお願いします。
なお、「アイドルマスター」タグを全件取得すると、メモリ500MB使って、ループの終了と開始(for文)*1に3秒くらいかかるようになります。
全体の流れ
ログインからAPI問い合わせまでは割愛します。LWPでログインしてHTMLを取得、正規表現でビデオIDを収集して、それぞれAPIに問い合わせているだけなので。
一応重複取得を避けるためにチェックを入れてます。
# %duplication_check を作っておく # @list に抜き出したIDを入れておく # 重複してないIDをこれに入れる my @video_id_list; foreach(@list){ unless ($duplication_check{"$_"}) { push (@video_id_list, $_); } $duplication_check{"$_"}++; } my @video_infos = get_video_infos(@video_id_list);
XML解析
サブルーチンの抜粋です。
# In : array of video_id # Out: array of hash of video_info # hash{tags} is ref array of tag text my $ua = LWP::UserAgent->new( keep_alive => 1 ); my $parser = new XML::DOM::Parser; foreach my $video_id (@video_ids) { my %hash; my $res = $ua->get($api_url); unless ($res->is_success) { warn "Wait for API.\n"; sleep 3; $res = $ua->get($api_url); die "can't get api result.\n" unless $res->is_success; } my $doc = $parser->parse($res->content); my($sec, $min, $hour, $day, $mon, $year) = localtime(time); $year += 1900; $mon++; $hash{'get_time'} = "$year-$mon-${day}T$hour:$min:$sec"; $hash{'video_id'} = $video_id; # DELETED, NOT_FOUNDの処理 my ($response) = $doc->getElementsByTagName('nicovideo_thumb_response'); my $api_status = $response->getAttributes->getNamedItem('status')->getNodeValue; if ($api_status eq 'fail') { my ($code) = $doc->getElementsByTagName('code'); $hash{'status'} = $code->getFirstChild->getNodeValue; next; } else { $hash{'status'} = 'ok'; } my ($title) = $doc->getElementsByTagName('title'); if ($title->hasChildNodes) { $hash{'title'} = $title->getFirstChild->getNodeValue; } else { $hash{'title'} = ""; } my ($description) = $doc->getElementsByTagName('description'); if ($description->hasChildNodes) { $hash{'description'} = $description->getFirstChild->getNodeValue || warn "Desc: $video_id cant't get description.\n"; } else { $hash{'description'} = ""; } my ($first_retrieve) = $doc->getElementsByTagName('first_retrieve'); $hash{'first_retrieve'} = $first_retrieve->getFirstChild->getNodeValue; my ($length) = $doc->getElementsByTagName('length'); $hash{'length'} = $length->getFirstChild->getNodeValue; my ($view_counter) = $doc->getElementsByTagName('view_counter'); $hash{'view_counter'} = $view_counter->getFirstChild->getNodeValue; my ($comment_num) = $doc->getElementsByTagName('comment_num'); $hash{'comment_num'} = $comment_num->getFirstChild->getNodeValue; my ($mylist_counter) = $doc->getElementsByTagName('mylist_counter'); $hash{'mylist_counter'} = $mylist_counter->getFirstChild->getNodeValue; my ($tags) = $doc->getElementsByTagName('tags'); $hash{'tags'} = undef; if ($tags->hasChildNodes) { my @tag_list; foreach my $tag ($tags->getChildNodes) { next if ($tag->getNodeName eq '#text'); next unless $tag->hasChildNodes; push @tag_list, $tag->getFirstChild->getNodeValue; } $hash{'tags'} = \@tag_list; } push @infos, \%hash; usleep(2); } return @infos;
DB書き込み
準備とループ
# 先に # my $dbh = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", # { RaiseError => 1, AutoCommit => 0}); # しておく foreach my $video_info (@video_infos) { my %info = %{$video_info}; my $status = proc_status($dbh, $info{'status'}); $info{'video_id'} =~ m/(..)(\d+)/; my $prefix = $1; my $video_id = $2; $prefix = proc_prefix($dbh, $prefix); if ($status ne "1") { proc_video($dbh, $video_id, 0, 0, 0); next; } my ($year, $month, $day, $hour, $min, $sec) = $info{'first_retrieve'} =~ m/^(\d+)\-(\d+)\-(\d+)T(\d+):(\d+):(\d+)/; $month = $1 if ($month =~ m/^0(\d)$/); $day = $1 if ($day =~ m/^0(\d)$/); $hour = $1 if ($hour =~ m/^0(\d)$/); $min = $1 if ($min =~ m/^0(\d)$/); $sec = $1 if ($sec =~ m/^0(\d)$/); my $first_retrieve = timelocal($sec, $min, $hour, $day, $month-1, $year-1900); my @duration = $info{'length'} =~ /(\d+)/g; my $length = $duration[-1] + ($duration[-2] * 60) + ($duration[-3] * 60 * 60); if ($status=~/[02]/) { proc_video($dbh, $video_id, $prefix, 0, 0, 0); } else { proc_video( $dbh, $video_id, $prefix, $info{'title'}, $first_retrieve, $length ); } proc_video_comment($dbh, $video_id, $info{'description'}); ($year, $month, $day, $hour, $min, $sec) = $info{'get_time'} =~ m/^(\d+)\-(\d+)\-(\d+)T(\d+):(\d+):(\d+)/; my $get_date = timelocal($sec, $min, $hour, $day, $month-1, $year-1900); my $video_info_id = proc_get_video_info( $dbh, $video_id, $status, $get_date, $info{'view_counter'}, $info{'comment_num'}, $info{'mylist_counter'} ); my @tag_ids = proc_tag($dbh, \@{$info{'tags'}}); proc_video_tag_info( $dbh, $video_id, $video_info_id, \@tag_ids ); } $dbh->commit; print "DB written. waiting.\n"; sleep 3;
TAG
sub proc_tag { my ($dbh, $ref_tags) = @_; my @tag_ids; my @tag_not_found; my $sql = "SELECT id FROM tag WHERE tag_name=?"; my $sth = $dbh->prepare($sql); foreach my $tag_name (@{$ref_tags}) { last if $tag_name eq ""; $sth->execute("$tag_name"); my $tag_id = $sth->fetch; if ($tag_id) { $tag_id = ${$tag_id}[0]; push @tag_ids, $tag_id; } else { push @tag_not_found, $tag_name; } } $sth->finish; $sth = $dbh->prepare( "INSERT INTO tag (tag_name) VALUES (?)") || warn "TAG INSERT prepare FAILD\n"; foreach (@tag_not_found) { $sth->execute("$_") || warn "TAG INSERT execute FAILD\n"; my $tag_id = $dbh->last_insert_id(undef, undef, "tag", undef); push @tag_ids, $tag_id; } $sth->finish; return @tag_ids; }
VIDEO
sub proc_video { my ($dbh, $video_id, $prefix, $title, $first_retrieve, $length) = @_; my $sth = $dbh->prepare("SELECT id, prefix FROM video WHERE id=?"); $sth->execute($video_id); my $vid = $sth->fetch; $sth->finish; if ($vid eq "") { my $sql = "INSERT INTO video". " (id, prefix, title, post_date, video_duration)". "VALUES (?,?,?,?,?)"; my $sth = $dbh->prepare($sql); $sth->execute( $video_id, $prefix, "$title", $first_retrieve, $length ); } $sth->finish; }
VIDEO_COMMENT
sub proc_video_comment { my ($dbh, $video_id, $desc) = @_; my $sth = $dbh->prepare("SELECT id FROM video_comment WHERE id=?"); $sth->execute($video_id); my $vid = $sth->fetch; $sth->finish; if ($vid eq "") { my $sql = "INSERT INTO video_comment". " (video_id, creators_comment)". "VALUES (?,?)"; my $sth = $dbh->prepare($sql); $sth->execute( $video_id, "$desc" ); } $sth->finish; }
GET_VIDEO_INFO
sub proc_get_video_info { my ($dbh, $video_id, $status, $get_date, $view, $comment, $list) = @_; my $sql = "INSERT INTO get_video_info". " (video_id, status, get_date,". " play_count, commented_count, mylisted_count)". "VALUES (?,?,?,?,?,?)"; my $sth = $dbh->prepare($sql); $sth->execute( $video_id,$status, $get_date, $view, $comment, $list ) || warn "$sql\n$video_id, $status, $get_date, $view, $comment, $list\n"; $sth->finish; return $dbh->last_insert_id(undef, undef, "get_video_info", undef); }
VIDEO_TAG_INFO
sub proc_video_tag_info { my ($dbh, $video_id, $video_info_id, $tag_ids) = @_; my $sql = "INSERT INTO video_tag_info". " (video_id, get_video_info_id, tag_id)". "VALUES (?,?,?)"; my $sth = $dbh->prepare($sql); foreach my $tag_id (@{$tag_ids}) { $sth->execute( $video_id, $video_info_id, $tag_id ) || warn "$sql\n$video_id, $video_info_id, $_"; } $sth->finish; }
PREFIX
sub proc_prefix { my ($dbh, $prefix_name) = @_; my $sth = $dbh->prepare("SELECT id FROM prefix WHERE prefix_name=?"); $sth->execute("$prefix_name"); my $re = $sth->fetch; $sth->finish; if ($re eq "") { my $sql = "INSERT INTO prefix (prefix_name) VALUES (?)"; my $sth = $dbh->prepare($sql); $sth->execute($prefix_name); } else { return ${$re}[0]; } $sth->finish; return $dbh->last_insert_id(undef, undef, "prefix", undef); }
STATUS
sub proc_status { my ($dbh, $status_name) = @_; my $sth = $dbh->prepare("SELECT id FROM status WHERE status_name=?"); $sth->execute("$status_name"); my $re = $sth->fetch; $sth->finish; if ($re eq "") { my $sql = "INSERT INTO status (status_name) VALUES (?)"; my $sth = $dbh->prepare($sql); $sth->execute($status_name); } else { return ${$re}[0]; } $sth->finish; return $dbh->last_insert_id(undef, undef, "status", undef); }
*1:最後の「}」直前の処理が終わってから、最初の「{」の次にある処理に移るまで