ニコニコ動画のタグ検索結果をスクレイピング
まだ作りかけだけど、とりあえずデータは取ってこられるようになったので晒す。
# -*- coding: utf-8; -*-
"""Scrape Nico Nico Douga tag-search result pages (Python 2 script).

Logs in through ``nicoapi``, downloads tag-search result pages and extracts
one tuple per listed video using BeautifulSoup.
"""
import sys
import time
import urllib

import nicoapi
from BeautifulSoup import BeautifulSoup

BASE_URL = 'http://www.nicovideo.jp/tag/'
# UTF-8 encoded byte string under Python 2 (see the coding line above).
DEFAULT_TAG = 'ミクオリジナル曲'


def geturl(page, tag=DEFAULT_TAG, sort='f'):
    """Return the tag-search URL for one result page.

    Args:
        page: Page number, sent as the ``page`` query parameter.
        tag: Tag to search for; it is percent-quoted into the URL path.
        sort: Value of the ``sort`` query parameter.
            NOTE(review): the meaning of 'f' is defined by the site and is
            not visible here.

    Returns:
        The complete URL as a string.
    """
    query = {'page': page, 'sort': sort}
    return BASE_URL + urllib.quote(tag) + '?' + urllib.urlencode(query)


def getopener(email, password):
    """Log in via ``nicoapi`` and return whatever ``login_n`` returns.

    The result is used by ``gethtmlpage`` as an object with an
    ``open(url)`` method -- presumably a urllib2-style opener; confirm
    against nicoapi.
    """
    api = nicoapi.nicoapi()
    return api.login_n(email, password)


def gethtmlpage(opener, page, tag=DEFAULT_TAG, sort='f'):
    """Download one search-result page and return its raw body.

    Args:
        opener: Object returned by ``getopener``.
        page: Page number to fetch.
        tag: Tag to search for (forwarded to ``geturl``).
        sort: Sort key (forwarded to ``geturl``).
    """
    return opener.open(geturl(page, tag, sort)).read()


def parse(html, res):
    """Extract the video entries from one result page and append them to *res*.

    Each ``div.cmn_thumb_frm`` element yields one tuple
    ``(id, uptime, title, plays, comments, mylists)``.

    Args:
        html: Markup of a search-result page.
        res: List that receives the tuples; it is modified in place.

    Raises:
        IndexError, ValueError: If the page does not have the expected
            structure. Entries parsed before the failure stay in *res*.
    """
    soup = BeautifulSoup(html)
    for thumb in soup('div', {'class': 'cmn_thumb_frm'}):
        left = thumb('div', {'class': 'cmn_thumb_L'})[0]
        right = thumb('div', {'class': 'cmn_thumb_R'})[0]

        # The first <strong> element is discarded; the remaining three hold
        # the counters, written with thousands separators.
        plays, comments, mylists = [
            int(strong.string.replace(',', ''))
            for strong in left('strong')[1:]
        ]

        uptime = right('strong')[0].string
        link = right('a', {'class': 'video'})[0]
        title = link.string
        # Drop the first six characters of the href, leaving the video id
        # (presumably a 'watch/' prefix -- confirm against the live markup).
        video_id = link.attrMap['href'][6:]

        res.append((video_id, uptime, title, plays, comments, mylists))


def main(email='xxxxxx@xxxxxx', password='xxxxxxx', pages=None,
         tag=DEFAULT_TAG, interval=3):
    """Scrape several result pages and return the collected entries.

    Args:
        email: Login e-mail address (the default is a placeholder).
        password: Login password (the default is a placeholder).
        pages: Iterable of page numbers; defaults to pages 1 through 9.
        tag: Tag to search for.
        interval: Seconds to sleep after each page.

    Returns:
        List of ``(id, uptime, title, plays, comments, mylists)`` tuples.
    """
    if pages is None:
        pages = range(1, 10)

    results = []
    opener = getopener(email, password)
    for page in pages:
        # Progress indicator on a single line: "1 2 3 ... end".
        sys.stdout.write('%s ' % page)
        sys.stdout.flush()
        parse(gethtmlpage(opener, page, tag), results)
        time.sleep(interval)
    print('end')
    return results
そのうちクラス化するかも。