Package nflvid
[frames | no frames]

Source Code for Package nflvid

  1  """ 
  2  Introduction 
  3  ============ 
  4  A simple library to download, slice and search NFL game footage on a 
  5  play-by-play basis. 
  6   
  7  This library comes with preloaded play-by-play meta data, which describes the 
  8  start time of each play in the game footage. However, the actual footage does 
  9  not come with this library and is not released by me. This package therefore 
 10  provides utilities to batch download NFL Game Footage from the original source. 
 11   
 12  Once game footage is downloaded, you can use this library to search plays and 
 13  construct a playlist to play in any video player. 
 14  """ 
 15   
 16  import gzip 
 17  import os 
 18  import os.path as path 
 19  import socket 
 20  import sys 
 21  import urllib2 
 22   
 23  import bs4 
 24   
 25  import eventlet 
 26  httplib2 = eventlet.import_patched('httplib2') 
 27  import eventlet.green.subprocess as subprocess 
 28   
 29  from nflgame import OrderedDict 
 30   
 31  _xmlf = path.join(path.split(__file__)[0], 'pbp-xml', '%s-%s.xml.gz') 
 32  _xml_base_url = 'http://e2.cdnl3.neulion.com/nfl/edl/nflgr/%d/%s.xml' 
 33   
 34  _footage_url = 'http://nlds82.cdnl3nl.neulion.com/nlds_vod/nfl/vod/' \ 
 35                 '%s/%s/%s/%s/2_%s_%s_%s_%s_h_whole_1_%s.mp4.m3u8' 
 36   
 37  __play_cache = {}  # game eid -> play id -> Play 
 38   
 39   
def footage_url(gobj, quality='1600'):
    """
    Returns the URL of the full game footage playlist for the game given.

    The month and day are read from characters 4-5 and 6-7 of the game's
    eid. The quality is substituted into the URL unchanged.
    """
    eid = gobj.eid
    fields = (
        gobj.season(), eid[4:6], eid[6:8], gobj.gamekey,
        gobj.gamekey, gobj.away.lower(), gobj.home.lower(), gobj.season(),
        quality,
    )
    return _footage_url % fields
45 46
def footage_full(footage_dir, gobj):
    """
    Returns the path to the full video of the game given inside an nflvid
    footage directory.

    None is returned when that file is not readable.
    """
    video = _full_path(footage_dir, gobj)
    return video if os.access(video, os.R_OK) else None
58 59
def footage_plays(footage_play_dir, gobj):
    """
    Returns a list of all footage broken down by play inside an nflvid
    footage directory. The list is sorted numerically by play id.

    Only entries named ``{number}.mp4`` are returned. Anything else found
    in the game's directory (for example 'full.mp4' or files left behind
    by other tools) is ignored rather than raising a ValueError.

    If no footage breakdown exists for the game provided, then an empty list
    is returned.
    """
    game_dir = _play_path(footage_play_dir, gobj)
    if not os.access(game_dir, os.R_OK):
        return []

    def play_number(name):
        # Returns the numeric play id encoded in a file name, or None if
        # the entry is not a sliced play.
        stem, ext = path.splitext(name)
        if ext != '.mp4':
            return None
        try:
            return int(stem)
        except ValueError:
            return None

    numbered = []
    for name in os.listdir(game_dir):
        number = play_number(name)
        if number is not None:
            numbered.append((number, name))
    numbered.sort()
    return [name for _, name in numbered]
73 74
def _full_path(footage_dir, g):
    """
    Returns the path of the full game video for g inside footage_dir::

        {footage_dir}/{eid}-{gamekey}.mp4
    """
    filename = '%s-%s.mp4' % (g.eid, g.gamekey)
    return path.join(footage_dir, filename)
77 78
def _play_path(footage_play_dir, g):
    """
    Returns the directory holding the sliced plays of g inside
    footage_play_dir::

        {footage_play_dir}/{eid}-{gamekey}
    """
    dirname = '%s-%s' % (g.eid, g.gamekey)
    return path.join(footage_play_dir, dirname)
81 82
def _nice_game(gobj):
    """
    Returns a short human readable description of the game given, built
    from the year and week in its schedule and its string form.
    """
    sched = gobj.schedule
    return '(Season: %s, Week: %s, %s)' % (sched['year'], sched['week'], gobj)
86 87
def unsliced_plays(footage_play_dir, gobj, dry_run=False):
    """
    Returns the plays of the game given that have not been sliced yet.

    A play counts as sliced only when the following file is readable,
    where {playid} is the zero padded play id::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    An empty list means every play considered has been sliced. None is
    returned if the play-by-play meta data could not be retrieved.

    If dry_run is true, then only the first 10 plays of the game are
    considered.
    """
    game_plays = plays(gobj)
    game_dir = _play_path(footage_play_dir, gobj)
    if game_plays is None:
        return None

    candidates = list(game_plays.values())
    if dry_run:
        candidates = candidates[:10]

    def is_sliced(p):
        clip = path.join(game_dir, '%s.mp4' % p.idstr())
        return os.access(clip, os.R_OK)

    return [p for p in candidates if not is_sliced(p)]
120 121
def slice(footage_play_dir, full_footage_file, gobj, threads=4, dry_run=False):
    """
    Cuts the full footage file given into one video per play with ffmpeg.

    full_footage_file should be a full game downloaded with nflvid-footage
    and gobj the matching nflgame.game.Game object. The pieces are saved
    as::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    Plays that already have a readable video file are skipped, so work is
    never repeated.

    The ffmpeg processes are run from an eventlet green pool holding at
    most `threads` workers. This function returns only after every worker
    has finished.

    If dry_run is true, then only the first 10 plays of the game are
    sliced.
    """
    game_dir = _play_path(footage_play_dir, gobj)
    if not os.access(game_dir, os.R_OK):
        os.makedirs(game_dir)

    workers = eventlet.greenpool.GreenPool(threads)
    pending = unsliced_plays(footage_play_dir, gobj, dry_run)
    if not pending:
        # Covers both "nothing left to do" and "no meta data available".
        pending = []
    for todo in pending:
        workers.spawn_n(slice_play,
                        footage_play_dir, full_footage_file, gobj, todo)
    workers.waitall()
152 153
def slice_play(footage_play_dir, full_footage_file, gobj, play,
               max_duration=15):
    """
    This is just like slice, but it only slices the play provided.
    In typical cases, slice should be used since it makes sure not
    to duplicate work.

    This function will not check if the play-by-play directory for
    gobj has been created.

    max_duration is used to cap the length of a play. This drastically
    cuts down on the time required to slice a game and the storage
    requirements of a game at the cost of potentially missing bigger
    plays. Set max_duration to 0 to impose no artificial cap.

    A play whose duration is unknown (None, typically the last play of
    the game) or zero is given 40 seconds, regardless of max_duration.
    """
    outdir = _play_path(footage_play_dir, gobj)
    st = play.start

    # st.milli holds milliseconds, so it must be zero padded to three
    # digits. ffmpeg reads the part after the dot as a decimal fraction,
    # which means '.50' would be half a second rather than 50ms.
    start_time = '%02d:%02d:%02d.%03d' % (st.hh, st.mm, st.ss, st.milli)
    outpath = path.join(outdir, '%s.mp4' % play.idstr())

    known = play.duration
    # The None check is explicit because comparing None with an int only
    # works on Python 2.
    if max_duration == 0 or known is None or known < max_duration:
        duration = known or 40  # Probably the last play of the game.
    else:
        duration = max_duration

    cmd = ['ffmpeg',
           '-ss', start_time,
           '-i', full_footage_file,
           '-t', '%d' % duration,
           '-map', '0',
           '-strict', '-2',
           outpath,
           ]
    _run_command(cmd)
188 189
def download(footage_dir, gobj, quality='1600', dry_run=False):
    """
    Runs ffmpeg to download the full footage of the game given at the
    quality provided. The qualities available are:
    400, 800, 1200, 1600, 2400, 3000, 4500 with 4500 being the best.

    The footage is saved to::

        footage_dir/{eid}-{gamekey}.mp4

    A LookupError is raised if a readable file already exists there.

    If dry_run is true, ffmpeg is told to stop after 30 seconds of
    footage.

    A full game's worth of footage at a quality of 1600 is about 2GB.
    """
    target = _full_path(footage_dir, gobj)
    if os.access(target, os.R_OK):
        raise LookupError('Footage path "%s" already exists.' % target)

    url = footage_url(gobj, quality)

    # Probe the URL first. A missing game is common, and a short message
    # is friendlier than the wall of text ffmpeg would print.
    headers, _ = httplib2.Http().request(url, 'HEAD')
    status = headers['status']
    if status != '200':
        sys.stderr.write('BAD URL (http status %s) for game %s: %s\n'
                         % (status, _nice_game(gobj), url))
        sys.stderr.write('FAILED to download game %s\n' % _nice_game(gobj))
        return

    cmd = ['ffmpeg', '-timeout', '60', '-i', url]
    if dry_run:
        cmd.extend(['-t', '30'])
    cmd.extend(['-strict', '-2', target])

    sys.stderr.write('Downloading game %s %s\n'
                     % (gobj.eid, _nice_game(gobj)))
    if _run_command(cmd):
        sys.stderr.write('DONE with game %s\n' % _nice_game(gobj))
    else:
        sys.stderr.write('FAILED to download game %s\n' % _nice_game(gobj))
233 234
def _run_command(cmd):
    """
    Runs the command given (a list of arguments) and waits for it to exit.
    Standard output and standard error are captured together.

    Returns True if the command exited with status 0. Otherwise a report
    is written to stderr and False is returned. This covers a non-zero
    exit status, a process killed by a signal (which is reported as a
    negative return code) and a command that could not be started at all.
    """
    try:
        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        output = proc.communicate()[0].strip()
    except OSError as e:
        sys.stderr.write("Could not run '%s' (errno: %d): %s\n"
                         % (' '.join(cmd), e.errno, e.strerror))
        return False

    # Any non-zero code is a failure, including the negative codes used
    # when the child is terminated by a signal.
    if proc.returncode != 0:
        if not isinstance(output, str):
            # Pipes yield bytes on Python 3.
            output = output.decode('utf-8', 'replace')
        indented = '\n'.join(' %s' % line for line in output.split('\n'))
        sys.stderr.write("Could not run '%s' (exit code %d):\n%s\n"
                         % (' '.join(cmd), proc.returncode, indented))
        return False
    return True
256 257
def plays(gobj):
    """
    Returns an ordered dictionary of all plays for a particular game.

    The game must be a nflgame.game.Game object.

    If there is a problem retrieving the data, or the data contains no
    usable plays, None is returned.

    Results are kept in an in-memory cache keyed by eid, which is only
    consulted once the game is over. If the game is over, the raw XML
    data is also saved (gzipped) to disk.
    """
    if gobj.game_over() and gobj.eid in __play_cache:
        return __play_cache[gobj.eid]

    rawxml = _get_xml_data((gobj.eid, gobj.gamekey))
    ps = _xml_play_data(rawxml)
    if ps is None:
        return None
    if len(ps) == 0:
        sys.stderr.write('Could not find ArchiveTCIN field in XML data. '
                         'This field provides the start time of each play.\n')
        return None
    __play_cache[gobj.eid] = ps

    # Save the XML data to disk if the game is over.
    if gobj.game_over():
        fp = _xmlf % (gobj.eid, gobj.gamekey)
        try:
            out = gzip.open(fp, 'wb')
            try:
                out.write(rawxml)
            finally:
                # Close explicitly so the gzip trailer is always flushed
                # and the handle is not left to the garbage collector.
                out.close()
        except IOError:
            sys.stderr.write('Could not cache XML data. Please make '
                             '"%s" writable.\n' % path.dirname(fp))
    return ps
290 291
def play(gobj, playid):
    """
    Returns a Play object given a game and a play id. The game must be
    a nflgame.game.Game object.

    If a play with the given id does not exist, or the play-by-play data
    for the game could not be retrieved, None is returned.
    """
    ps = plays(gobj)
    if ps is None:
        # plays() signals a retrieval problem with None; do not call
        # .get on it.
        return None
    return ps.get(playid, None)
300 301
class Play (object):
    """
    A single play together with the meta data tying it to game footage.

    `start` is when the play begins in the footage (taken from the
    'ArchiveTCIN' field). There is no record of when a play stops, so
    `duration` is the gap between this play's start and the start of the
    next one, or None for the last play recorded.

    `playid` is the foreign key that maps to play data stored in nflgame.
    """
    def __init__(self, start, duration, playid):
        self.start = start
        self.duration = duration
        self.playid = playid

    def idstr(self):
        """Returns the play id as a string, zero padded to four digits."""
        return str(int(self.playid)).zfill(4)

    def __str__(self):
        fields = (self.playid, self.start, self.duration)
        return '(%s, %s, %s)' % fields
322 323
class PlayTime (object):
    """
    Represents a footage time point, in the format HH:MM:SS:MM.

    The last field is believed to be given in tens of milliseconds. It is
    stored in the `milli` attribute converted to milliseconds.

    Time points compare by their fractional value. An AssertionError is
    raised for a malformed time point, even when Python runs with -O.
    """
    def __init__(self, point):
        self.point = point

        try:
            parts = [int(field) for field in self.point.split(':')]
        except ValueError:
            raise AssertionError('Bad play time format: %s' % self.point)

        if len(parts) != 4:
            raise AssertionError('Expected 4 parts but got %d in: %s'
                                 % (len(parts), self.point))

        self.hh, self.mm, self.ss, self.milli = parts

        # I believe milliseconds is given in tens of milliseconds.
        self.milli *= 10

    def _whole_seconds(self):
        """Returns the time point in seconds, ignoring milliseconds."""
        return (self.hh * 60 * 60) + (self.mm * 60) + self.ss

    def seconds(self):
        """
        Returns this time point rounded to the nearest second.
        """
        secs = self._whole_seconds()
        # milli is in milliseconds, so half a second is 500.
        if self.milli >= 500:
            secs += 1
        return secs

    def fractional(self):
        """
        Returns this time point as fractional seconds based on milliseconds.
        """
        total_ms = (1000 * self._whole_seconds()) + self.milli
        return float(total_ms) / 1000.0

    def __cmp__(self, other):
        # Python 2 fallback; written without the cmp builtin so that it
        # also runs on Python 3.
        mine, theirs = self.fractional(), other.fractional()
        return (mine > theirs) - (mine < theirs)

    # Rich comparisons, since Python 3 ignores __cmp__.
    def __eq__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() == other.fractional()

    def __ne__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() != other.fractional()

    def __lt__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() < other.fractional()

    def __le__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() <= other.fractional()

    def __gt__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() > other.fractional()

    def __ge__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() >= other.fractional()

    def __hash__(self):
        # Keeps instances hashable now that __eq__ is defined; equal time
        # points hash alike.
        return hash(self.fractional())

    def __sub__(self, other):
        """
        Returns the difference rounded to nearest second between
        two time points. The 'other' time point must take place before the
        current time point, otherwise an AssertionError is raised.
        """
        if not other <= self:
            raise AssertionError('%s is not <= than %s' % (other, self))
        return int(round(self.fractional() - other.fractional()))

    def __str__(self):
        return self.point
376 377
def _xml_play_data(data):
    """
    Turns raw play-by-play XML into an ordered dictionary of Play objects
    keyed by play id. Returns None if data is None.
    """
    if data is None:
        return None

    def skip(row):
        # Decides whether a row is left out of the final result. Timeouts,
        # for example, take a lot of time but are not needed for
        # play-by-play footage.
        attrs = row.attrs
        if 'playdescription' in attrs:
            desc = row['playdescription'].lower()
            if desc.startswith(('timeout', 'two-minute')):
                return True

        # Did we miss anything?
        if 'preplaybyplay' in attrs:
            if row['preplaybyplay'].lower().startswith('timeout'):
                return True
        return False

    # Collect every usable row first. A play's duration depends on the
    # start time of the row that follows it, so we need to look ahead.
    timed = []
    for row in bs4.BeautifulSoup(data).find_all('row'):
        id_tag = row.find('id')
        if not id_tag:
            continue
        start_tag = row.find('archivetcin')
        if not start_tag:
            continue

        playid = id_tag.get_text().strip()
        start = PlayTime(start_tag.get_text().strip())

        # Skip rows that start before the previously accepted row.
        if timed and start < timed[-1][1]:
            continue
        timed.append((playid, start, row))

    result = OrderedDict()
    last = len(timed) - 1
    for i, (playid, start, row) in enumerate(timed):
        if skip(row):
            continue
        # Skipped rows still end the play before them, so the look-ahead
        # indexes the unfiltered list.
        duration = timed[i + 1][1] - start if i < last else None
        result[playid] = Play(start, duration, playid)
    return result
def _get_xml_data(game=None, fpath=None):
    """
    Returns the XML play data corresponding to the game given. A game must
    be specified as a tuple: the first element should be an eid and the second
    element should be a game key. For example, ('2012102108', '55604').

    If the XML data is already on disk, it is read, decompressed and returned.

    Otherwise, the XML data is downloaded from the NFL web site. If the data
    doesn't exist yet or there was an error (HTTP error, unreachable host,
    timeout), the error is written to stderr and None is returned.

    If fpath is given, the XML data is read from that gzipped file and game
    is ignored. At least one of game and fpath must be given.
    """
    assert game is not None or fpath is not None

    def read_gz(gzpath):
        # Reads and decompresses a whole file, always closing the handle.
        f = gzip.open(gzpath)
        try:
            return f.read()
        finally:
            f.close()

    if fpath is not None:
        return read_gz(fpath)

    fpath = _xmlf % (game[0], game[1])
    if os.access(fpath, os.R_OK):
        return read_gz(fpath)
    try:
        year = int(game[0][0:4])
        month = int(game[0][4:6])
        # Games played in January through March belong to the season that
        # started the previous calendar year.
        if month <= 3:
            year -= 1
        u = _xml_base_url % (year, game[1])  # The year and the game key.
        resp = urllib2.urlopen(u, timeout=10)
        try:
            return resp.read()
        finally:
            resp.close()
    except (urllib2.URLError, socket.timeout) as e:
        # URLError is the base class of HTTPError. It also covers DNS
        # failures and refused connections.
        sys.stderr.write('%s\n' % e)
    return None
465