Here is my Python script for grabbing the latest 1000 comments on a video (unfortunately, the API only allows access to the latest 1000) and checking them against a regexp that matches known racist words. Right now it only looks for the N-word. This script will be one of the inner map tasks in a series of Map-Reduce steps.

#!/usr/bin/env python

import sys

import gdata.youtube

import gdata.youtube.service

import re

# Hadoop-streaming mapper: reads one YouTube video id per line on stdin and
# prints "author<TAB>comment" for every comment that matches racist_pattern.

racist_pattern = re.compile('.*igger.*', re.IGNORECASE)

# First page of a video's comment feed; later pages are reached by following
# the feed's "next" link.
urlpattern = 'http://gdata.youtube.com/feeds/api/videos/%s/comments?start-index=%d&max-results=50'

# Currently the Google YouTube GData API will not support over 1000 comments,
# so after 19 full pages (950 comments) the remainder is fetched with one
# fixed-size request instead of following the "next" link any further.
tail_urlpattern = 'http://gdata.youtube.com/feeds/api/videos/%s/comments?start-index=951&max-results=49'
MAX_FULL_PAGES = 19


def is_flagged(text):
    """Return True if *text* contains a match for racist_pattern.

    Uses ``search`` rather than ``match``: ``.`` does not match a newline, so
    ``match`` would only ever inspect the first line of a multi-line comment.
    Empty or missing text (``None``) is never flagged.
    """
    if not text:
        return False
    return racist_pattern.search(text) is not None


def print_flagged(comment_feed):
    """Print one "author<TAB>comment" record per flagged entry of the feed."""
    for comment in comment_feed.entry:
        text = comment.content.text
        if is_flagged(text):
            print('%s\t%s\n' % (comment.author[0].name.text, text))


def scan_video(yt_service, video_id):
    """Scan the reachable comments of *video_id* and print the flagged ones.

    yt_service -- object providing GetYouTubeVideoCommentFeed(uri=...).
    video_id   -- YouTube video id string.
    """
    url = urlpattern % (video_id, 1)
    pages_fetched = 0
    while url and pages_fetched < MAX_FULL_PAGES:
        comment_feed = yt_service.GetYouTubeVideoCommentFeed(uri=url)
        print_flagged(comment_feed)
        pages_fetched += 1
        # The last page has no "next" link; stop instead of dereferencing
        # None, which is what happens for videos with few comments.
        next_link = comment_feed.GetNextLink()
        url = next_link.href if next_link is not None else None
    if url:
        # More comments remain after 950: fetch the final window allowed.
        tail_feed = yt_service.GetYouTubeVideoCommentFeed(
            uri=tail_urlpattern % video_id)
        print_flagged(tail_feed)


def main():
    """Read video ids from stdin, one per line, and scan each of them."""
    yt_service = gdata.youtube.service.YouTubeService()
    # Turns out the developer key isn't necessary, so none is set here.
    for line in sys.stdin:
        video_id = line.strip()
        if not video_id:
            # A blank input line would otherwise build a malformed feed URL.
            continue
        scan_video(yt_service, video_id)


if __name__ == '__main__':
    main()

#!/usr/bin/env python
import sys
import gdata.youtube
import gdata.youtube.service
import re
# Hadoop-streaming mapper: reads one YouTube video id per line on stdin and
# prints "author<TAB>comment" for every comment that matches racist_pattern.
racist_pattern = re.compile('.*igger.*', re.IGNORECASE)
# First page of a video's comment feed; later pages are reached by following
# the feed's "next" link.
urlpattern = 'http://gdata.youtube.com/feeds/api/videos/%s/comments?start-index=%d&max-results=50'
# Currently the Google YouTube GData API will not support over 1000 comments,
# so after 19 full pages (950 comments) the remainder is fetched with one
# fixed-size request instead of following the "next" link any further.
tail_urlpattern = 'http://gdata.youtube.com/feeds/api/videos/%s/comments?start-index=951&max-results=49'
MAX_FULL_PAGES = 19


def is_flagged(text):
    """Return True if *text* contains a match for racist_pattern.

    Uses ``search`` rather than ``match``: ``.`` does not match a newline, so
    ``match`` would only ever inspect the first line of a multi-line comment.
    Empty or missing text (``None``) is never flagged.
    """
    if not text:
        return False
    return racist_pattern.search(text) is not None


def print_flagged(comment_feed):
    """Print one "author<TAB>comment" record per flagged entry of the feed."""
    for comment in comment_feed.entry:
        text = comment.content.text
        if is_flagged(text):
            print('%s\t%s\n' % (comment.author[0].name.text, text))


def scan_video(yt_service, video_id):
    """Scan the reachable comments of *video_id* and print the flagged ones.

    yt_service -- object providing GetYouTubeVideoCommentFeed(uri=...).
    video_id   -- YouTube video id string.
    """
    url = urlpattern % (video_id, 1)
    pages_fetched = 0
    while url and pages_fetched < MAX_FULL_PAGES:
        comment_feed = yt_service.GetYouTubeVideoCommentFeed(uri=url)
        print_flagged(comment_feed)
        pages_fetched += 1
        # The last page has no "next" link; stop instead of dereferencing
        # None, which is what happens for videos with few comments.
        next_link = comment_feed.GetNextLink()
        url = next_link.href if next_link is not None else None
    if url:
        # More comments remain after 950: fetch the final window allowed.
        tail_feed = yt_service.GetYouTubeVideoCommentFeed(
            uri=tail_urlpattern % video_id)
        print_flagged(tail_feed)


def main():
    """Read video ids from stdin, one per line, and scan each of them."""
    yt_service = gdata.youtube.service.YouTubeService()
    # No developer key is set: it is not required for reading comment feeds,
    # and a literal key should never be kept in the source, even commented out.
    for line in sys.stdin:
        video_id = line.strip()
        if not video_id:
            # A blank input line would otherwise build a malformed feed URL.
            continue
        scan_video(yt_service, video_id)


if __name__ == '__main__':
    main()

Map-Reduce, Hadoop, Hadoop Streaming, Python and racism

Add Your Comment

Powered by WP Hashcash