ニコニコ動画のタグ検索結果をスクレイピング

まだ作りかけだけど、とりあえずデータは取ってこられるようになったので晒す。

# -*- coding: utf-8; -*-
import time, urllib
import nicoapi
from BeautifulSoup import BeautifulSoup

def geturl(page, tag='ミクオリジナル曲'):
    """Build the URL of one page of niconico tag-search results.

    Args:
        page: Page number, sent as the ``page`` query parameter.
        tag: Tag to search for; it is percent-encoded into the URL path.
            Defaults to the tag that used to be hard-coded here, so
            existing ``geturl(page)`` calls keep searching the same tag.

    Returns:
        A string of the form
        ``http://www.nicovideo.jp/tag/<quoted tag>?page=<page>&sort=f``.
    """
    # Imported locally so the function resolves quote/urlencode on both
    # Python 2 (urllib) and Python 3 (urllib.parse).
    try:
        from urllib import quote, urlencode
    except ImportError:
        from urllib.parse import quote, urlencode

    baseurl = 'http://www.nicovideo.jp/tag/'
    # A sequence of pairs (instead of a dict) keeps the order of the query
    # parameters deterministic: always page first, then sort.
    query = (('page', page), ('sort', 'f'))
    return baseurl + quote(tag) + '?' + urlencode(query)

def getopener(email, password):
    """Log in through ``nicoapi`` and return whatever ``login_n`` returns.

    Args:
        email: Account e-mail address passed to ``login_n``.
        password: Account password passed to ``login_n``.

    Returns:
        The result of ``nicoapi.nicoapi().login_n(email, password)``.
        NOTE(review): callers use it as an object with an ``open(url)``
        method -- confirm against the nicoapi module.
    """
    client = nicoapi.nicoapi()
    opener = client.login_n(email, password)
    return opener
    
def gethtmlpage(opener, page):
    """Download one tag-search result page and return its body.

    Args:
        opener: Object whose ``open(url)`` returns something with a
            ``read()`` method (in this script, the value from ``getopener``).
        page: Page number, forwarded to ``geturl`` to build the URL.

    Returns:
        Whatever ``read()`` yields for the requested page.
    """
    url = geturl(page)
    response = opener.open(url)
    return response.read()

def parse(html, res):
    """Extract one record per video from a result page and append to ``res``.

    Every ``div.cmn_thumb_frm`` element is treated as one video entry.  For
    each entry a tuple ``(id, uptime, title, plays, comments, mylists)`` is
    appended to ``res``; nothing is returned.

    Args:
        html: Markup of a tag-search result page.
        res: List that receives the extracted tuples (mutated in place).
    """
    soup = BeautifulSoup(html)
    frames = soup('div', {'class': 'cmn_thumb_frm'})

    # First pass: pick the two halves of every entry.  All lookups finish
    # before anything is appended to ``res``.
    entries = [
        (
            # Discard the first <strong> element of the left-hand block.
            frame('div', {'class': 'cmn_thumb_L'})[0]('strong')[1:],
            frame('div', {'class': 'cmn_thumb_R'})[0],
        )
        for frame in frames
    ]

    # Second pass: turn each pair into a flat record.
    for strongs, right in entries:
        # The remaining <strong> elements must be exactly three numbers
        # written with thousands separators, e.g. "1,234".
        plays, comments, mylists = [
            int(strong.string.replace(',', '')) for strong in strongs
        ]
        uptime = right('strong')[0].string
        link = right('a', {'class': 'video'})[0]
        title = link.string
        # Drops the first six characters of the href; presumably a
        # "watch/" prefix in front of the video id -- TODO confirm.
        video_id = link.attrMap['href'][6:]
        res.append((video_id, uptime, title, plays, comments, mylists))

def main():
    L = []
    opener = getopener('xxxxxx@xxxxxx', 'xxxxxxx')
    for page in xrange(1, 10):
        print page, 
        parse(gethtmlpage(opener, page), L)
        time.sleep(3)
    print 'end'
    return L

そのうちクラス化するかも。