Here is my Python script that grabs the latest 1000 comments for a video (unfortunately the API only allows access to the latest 1000) and checks them against a regexp that matches known racist words. Right now it is only looking for the N word. This script will be one of the inner map tasks in a series of Map-Reduce steps.
#!/usr/bin/env python
import sys
import gdata.youtube
import gdata.youtube.service
import re
# Case-insensitive pattern for the word currently being screened for.
# NOTE(review): the bare "igger" stem also matches harmless words such as
# "bigger" or "trigger"; tighten the pattern if false positives matter.
racist_pattern = re.compile('.*igger.*', re.IGNORECASE)

# Comment feed URL for a video: (video id, 1-based start index), 50 per page.
urlpattern = 'http://gdata.youtube.com/feeds/api/videos/%s/comments?start-index=%d&max-results=50'

# The YouTube gdata API does not serve more than 1000 comments, so the last
# request is a shortened page starting at comment 951.
last_page_urlpattern = 'http://gdata.youtube.com/feeds/api/videos/%s/comments?start-index=951&max-results=49'

# Number of full 50-comment pages fetched before the shortened last page.
FULL_PAGES = 19


def is_flagged(text):
    """Return True when *text* contains a match for ``racist_pattern``.

    ``search`` is used rather than ``match`` so that a hit on any line of a
    multi-line comment is found ('.' does not cross newlines, so ``match``
    would only ever inspect the first line).  ``None`` is treated as "no
    match" instead of raising TypeError.
    """
    return text is not None and racist_pattern.search(text) is not None


def format_match(author, text):
    """Return the output record for one flagged comment.

    The record is ``author<TAB>text`` followed by two newlines, which is
    byte-for-byte what the original ``print '%s\\t%s\\n' % ...`` emitted.
    """
    return '%s\t%s\n\n' % (author, text)


def report_matches(comment_feed, out):
    """Write a record to *out* for every flagged comment in *comment_feed*."""
    for comment in comment_feed.entry:
        text = comment.content.text
        if is_flagged(text):
            out.write(format_match(comment.author[0].name.text, text))


def scan_video(yt_service, video_id, out):
    """Scan the accessible comments of *video_id* and report flagged ones.

    Up to ``FULL_PAGES`` pages are fetched by following each feed's "next"
    link; if the video still has more comments after that, one final
    shortened page (``last_page_urlpattern``) is fetched.  Scanning stops
    early as soon as a feed has no next link, i.e. the video has fewer
    comments than the cap.
    """
    url = urlpattern % (video_id, 1)
    for _ in range(FULL_PAGES):
        feed = yt_service.GetYouTubeVideoCommentFeed(uri=url)
        report_matches(feed, out)
        # GetNextLink() yields None on the last page; dereferencing .href
        # unconditionally would raise AttributeError for short comment lists.
        next_link = feed.GetNextLink()
        if next_link is None or not next_link.href:
            return
        url = next_link.href
    feed = yt_service.GetYouTubeVideoCommentFeed(uri=last_page_urlpattern % video_id)
    report_matches(feed, out)


def main():
    """Read one video id per line from stdin and scan each video's comments."""
    # No developer key is set: the comment feeds are readable without one.
    yt_service = gdata.youtube.service.YouTubeService()
    for line in sys.stdin:
        video_id = line.strip()
        if not video_id:
            # Skip blank lines instead of requesting a feed for an empty id.
            continue
        scan_video(yt_service, video_id, sys.stdout)


if __name__ == '__main__':
    main()
