Package nflvid
[frames] | no frames]

Source Code for Package nflvid

  1  """ 
  2  Introduction 
  3  ============ 
  4  A simple library to download, slice and search NFL game footage on a 
  5  play-by-play basis. 
  6   
  7  This library comes with preloaded play-by-play meta data, which describes the 
  8  start time of each play in the game footage. However, the actual footage does 
  9  not come with this library and is not released by me. This package therefore 
 10  provides utilities to batch download NFL Game Footage from the original source. 
 11   
 12  Once game footage is downloaded, you can use this library to search plays and 
 13  construct a playlist to play in any video player. 
 14  """ 
 15   
 16  import gzip 
 17  import math 
 18  import os 
 19  import os.path as path 
 20  import socket 
 21  import sys 
 22  import urllib2 
 23   
 24  import bs4 
 25   
 26  import eventlet 
 27  httplib2 = eventlet.import_patched('httplib2') 
 28  import eventlet.green.subprocess as subprocess 
 29   
 30  from nflgame import OrderedDict 
 31   
 32  _xmlf = path.join(path.split(__file__)[0], 'pbp-xml', '%s-%s.xml.gz') 
 33  _xml_base_url = 'http://e2.cdnl3.neulion.com/nfl/edl/nflgr/%d/%s.xml' 
 34  _coach_url = 'rtmp://neulionms.fcod.llnwd.net/a5306/e1/mp4:' \ 
 35               'u/nfl/nfl/coachtapes/%s/%s_all_1600' 
 36  _coach_url = ( 
 37      'rtmp://neulionms.fcod.llnwd.net', 
 38      'a5306/e1', 
 39      'mp4:u/nfl/nfl/coachtapes/%s/%s_all_1600', 
 40  ) 
 41  _broadcast_url = 'http://nlds82.cdnl3nl.neulion.com/nlds_vod/nfl/vod/' \ 
 42                   '%s/%s/%s/%s/2_%s_%s_%s_%s_h_whole_1_%s.mp4.m3u8' 
 43   
 44  __broadcast_cache = {}  # game eid -> play id -> Play 
 45  __coach_cache = {}  # game eid -> play id -> Play 
 46   
 47   
def _eprint(s):
    """Writes s, followed by a newline, to standard error."""
    sys.stderr.write('%s\n' % (s,))
50 51
def broadcast_url(gobj, quality='1600'):
    """
    Builds the HTTP Live Stream URL (an m3u8 file) of the broadcast
    footage for the given game at the given quality.

    The month and day in the URL are taken from characters 4-8 of the
    game's eid.

    Note that this does not work with every game (yet): the real URLs
    vary from game to game in ways this template does not capture.
    """
    eid = gobj.eid
    month = eid[4:6]
    day = eid[6:8]
    fields = (
        gobj.season(), month, day,
        gobj.gamekey, gobj.gamekey,
        gobj.away.lower(), gobj.home.lower(),
        gobj.season(), quality,
    )
    return _broadcast_url % fields
64 65
def coach_url(gobj):
    """
    Builds the rtmp location of the coach footage for the given game.

    The result is a triple whose elements are::

        (rtmp server, rtmp app name, rtmp playpath)

    Coach video only comes in 1600 quality.
    """
    server = _coach_url[0]
    app = _coach_url[1]
    playpath = _coach_url[2] % (gobj.season(), gobj.gamekey)
    return (server, app, playpath)
80 81
def footage_full(footage_dir, gobj):
    """
    Locates the full game video for gobj inside the nflvid footage
    directory footage_dir.

    Returns the file path when it is readable and None otherwise.
    """
    candidate = _full_path(footage_dir, gobj)
    if os.access(candidate, os.R_OK):
        return candidate
    return None
93 94
def footage_plays(footage_play_dir, gobj):
    """
    Returns a list of all footage broken down by play inside an nflvid
    footage directory. The list holds file names (not full paths) and is
    sorted numerically by play id.

    The play id of an entry is its name with the last four characters
    (the '.mp4' extension) removed. Entries whose name does not yield an
    integer that way (editor backups, hidden files, ...) are skipped
    instead of aborting the whole listing with a ValueError.

    If no footage breakdown exists for the game provided, then an empty list
    is returned.
    """
    fp = _play_path(footage_play_dir, gobj)
    if not os.access(fp, os.R_OK):
        return []

    numbered = []
    for name in os.listdir(fp):
        try:
            numbered.append((int(name[0:-4]), name))
        except ValueError:
            # Not a play slice; ignore it.
            continue
    # Sort on the play id only so that ties keep their listing order.
    numbered.sort(key=lambda pair: pair[0])
    return [name for _, name in numbered]
107 108
def footage_play(footage_play_dir, gobj, playid):
    """
    Locates the slice of a single play inside the footage play directory.

    The file looked up is named after the play id, zero padded to four
    digits, with an '.mp4' extension, inside the directory of the game.

    Returns the file path when it is readable and None otherwise.
    """
    filename = '%04d.mp4' % int(playid)
    candidate = path.join(_play_path(footage_play_dir, gobj), filename)
    if os.access(candidate, os.R_OK):
        return candidate
    return None
121 122
def _full_path(footage_dir, g):
    """
    Returns the path of the full game video for game g inside
    footage_dir, i.e. {footage_dir}/{eid}-{gamekey}.mp4. The file is not
    checked for existence.
    """
    filename = '%s-%s.mp4' % (g.eid, g.gamekey)
    return path.join(footage_dir, filename)
125 126
def _play_path(footage_play_dir, g):
    """
    Returns the directory holding the play slices of game g inside
    footage_play_dir, i.e. {footage_play_dir}/{eid}-{gamekey}. The
    directory is not checked for existence.
    """
    dirname = '%s-%s' % (g.eid, g.gamekey)
    return path.join(footage_play_dir, dirname)
129 130
def _nice_game(gobj):
    """
    Returns a short human readable description of a game, built from the
    'year' and 'week' entries of its schedule and its string form.
    """
    schedule = gobj.schedule
    details = (schedule['year'], schedule['week'], gobj)
    return '(Season: %s, Week: %s, %s)' % details
134 135
def unsliced_plays(footage_play_dir, gobj, coach=True, dry_run=False):
    """
    Lists the plays of a game that still have to be sliced.

    A play counts as sliced only when the following file is readable,
    where {playid} is the zero padded play id::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    Every other play of the game is part of the returned list, so an
    empty list means that all plays have been sliced. None is returned
    when the play-by-play meta data could not be retrieved.

    If coach is False, then play timings for broadcast footage are used
    instead of coach timings.

    If dry_run is True, then only the first 10 plays of the game are
    examined.
    """
    all_plays = plays(gobj, coach)
    game_dir = _play_path(footage_play_dir, gobj)
    if all_plays is None:
        return None

    missing = []
    for position, candidate in enumerate(all_plays.values()):
        if dry_run and position >= 10:
            break
        target = path.join(game_dir, '%s.mp4' % candidate.idstr())
        if os.access(target, os.R_OK):
            continue
        missing.append(candidate)
    return missing
171 172
def slice(footage_play_dir, full_footage_file, gobj, coach=True,
          threads=4, dry_run=False):
    """
    Cuts a full game video into one file per play using ffmpeg.

    full_footage_file should point to a full game downloaded with
    nflvid-footage and gobj should be the corresponding
    nflgame.game.Game object. The pieces are saved as::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    The game directory is created when it is not readable. Plays whose
    video file already exists are not regenerated.

    The ffmpeg instances are run through an eventlet green pool of at
    most `threads` workers, and this function only returns once all of
    them have finished.

    If coach is False, then play timings for broadcast footage are used
    instead of coach timings.

    If dry_run is true, then only the first 10 plays of the game are
    sliced.
    """
    game_dir = _play_path(footage_play_dir, gobj)
    if not os.access(game_dir, os.R_OK):
        os.makedirs(game_dir)

    remaining = unsliced_plays(footage_play_dir, gobj, coach, dry_run)
    if not remaining:
        # Covers both "nothing left to do" and "no meta data" (None).
        _eprint(
            'There are no unsliced plays remaining for game %s %s.\n'
            'If they have not been sliced yet, then the XML play-by-play '
            'meta data may not be available or is corrupt.'
            % (gobj, _nice_game(gobj)))
        return

    workers = eventlet.greenpool.GreenPool(threads)
    for pending in remaining:
        # No duration cap (0) and scoreboard clipping enabled (True).
        workers.spawn_n(slice_play, footage_play_dir, full_footage_file,
                        gobj, pending, 0, True)
    workers.waitall()

    _eprint('DONE slicing game %s' % _nice_game(gobj))
219 220
def slice_play(footage_play_dir, full_footage_file, gobj, play,
               max_duration=0, cut_scoreboard=True):
    """
    This is just like slice, but it only slices the play provided.
    In typical cases, slice should be used since it makes sure not
    to duplicate work.

    This function will not check if the play-by-play directory for
    gobj has been created.

    max_duration is used to cap the length of a play. This drastically
    cuts down on the time required to slice a game and the storage
    requirements of a game at the cost of potentially missing bigger
    plays. This is particularly useful if you are slicing broadcast
    footage, where imposing a cap at about 15 seconds can decrease
    storage and CPU requirements by more than half without missing much.
    A value of 0 (the default) means no cap.

    When cut_scoreboard is True, the first 3.0 seconds of
    the play will be clipped to remove the scoreboard view.

    A play without an end time is given a length of 40 seconds.
    """
    outdir = _play_path(footage_play_dir, gobj)
    st = play.start
    outpath = path.join(outdir, '%s.mp4' % play.idstr())

    et = play.end
    if et is None:  # Probably the last play of the game.
        et = st.add_seconds(40)
    if max_duration > 0 and (et.seconds() - st.seconds()) > max_duration:
        et = st.add_seconds(max_duration)

    if cut_scoreboard:
        # NOTE(review): for a play shorter than 3 seconds this moves the
        # start past the end and yields a negative duration.
        st = st.add_seconds(3.0)

    dr = PlayTime(seconds=et.fractional() - st.fractional())

    # The fraction must be zero padded to three digits. ffmpeg reads the
    # digits after the dot as a decimal fraction, so 5 milliseconds has
    # to be written as '.005'; an unpadded '.5' would mean half a second.
    start_time = '%02d:%02d:%02d.%03d' % (st.hh, st.mm, st.ss, st.milli)
    duration = '%02d:%02d:%02d.%03d' % (dr.hh, dr.mm, dr.ss, dr.milli)
    cmd = ['ffmpeg',
           '-ss', start_time,
           '-t', duration,
           '-i', full_footage_file,
           '-acodec', 'copy',
           '-vcodec', 'copy',
           outpath,
           ]
    _run_command(cmd)
267 268
def download_broadcast(footage_dir, gobj, quality='1600', dry_run=False):
    """
    Downloads the full broadcast of the given game with ffmpeg at the
    quality provided. The qualities available are:
    400, 800, 1200, 1600, 2400, 3000, 4500 with 4500 being the best.

    The footage is saved to the following path::

        footage_dir/{eid}-{gamekey}.mp4

    If footage is already at that path, then a LookupError is raised.

    If dry_run is True, '-t 30' is added to the ffmpeg command.

    Progress and failures are reported on standard error; nothing is
    returned.

    A full game's worth of footage at a quality of 1600 is about 2GB.
    """
    target = _full_path(footage_dir, gobj)
    if os.access(target, os.R_OK):
        raise LookupError('Footage path "%s" already exists.' % target)

    url = broadcast_url(gobj, quality)

    # Probe the URL before handing it to ffmpeg. A bad URL is a common
    # failure and a short message is much nicer than ffmpeg's output.
    response = httplib2.Http().request(url, 'HEAD')[0]
    status = response['status']
    if status != '200':
        _eprint('BAD URL (http status %s) for game %s: %s'
                % (status, _nice_game(gobj), url))
        _eprint('FAILED to download game %s' % _nice_game(gobj))
        return

    cmd = ['ffmpeg', '-timeout', '60', '-i', url]
    if dry_run:
        cmd.extend(['-t', '30'])
    cmd.extend([
        '-absf', 'aac_adtstoasc',  # no idea. ffmpeg says I need it though.
        '-acodec', 'copy',
        '-vcodec', 'copy',
        target,
    ])

    _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
    if _run_command(cmd):
        _eprint('DONE with game %s' % _nice_game(gobj))
    else:
        _eprint('FAILED to download game %s' % _nice_game(gobj))
315 316
def download_coach(footage_dir, gobj, dry_run=False):
    """
    Downloads the full coach footage of the given game with rtmpdump.
    Currently, the only quality available is 1600.

    The footage is saved to the following path::

        footage_dir/{eid}-{gamekey}.mp4

    If footage is already at that path, then a LookupError is raised.

    If dry_run is True, '--stop 30' is added to the rtmpdump command.

    Progress and failures are reported on standard error; nothing is
    returned.

    A full game's worth of footage at a quality of 1600 is about 1GB.
    """
    target = _full_path(footage_dir, gobj)
    if os.access(target, os.R_OK):
        raise LookupError('Footage path "%s" already exists.' % target)

    server, app, playpath = coach_url(gobj)

    cmd = [
        'rtmpdump',
        '--rtmp', server,
        '--app', app,
        '--playpath', playpath,
        '--timeout', '60',
    ]
    if dry_run:
        cmd.extend(['--stop', '30'])
    cmd.extend(['-o', target])

    _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
    outcome = _run_command(cmd)
    if outcome is None:
        # _run_command reports rtmpdump's exit code 2 as None.
        _eprint('DONE (incomplete) with game %s' % _nice_game(gobj))
    elif outcome:
        _eprint('DONE with game %s' % _nice_game(gobj))
    else:
        _eprint('FAILED to download game %s' % _nice_game(gobj))
354 355
def _run_command(cmd):
    """
    Runs the command given as an argument list and waits for it to end.
    Standard output and standard error of the command are captured
    together and only shown if the command fails.

    Returns True if the command exited with status 0 and False if it
    could not be started or ended with any other status. That includes
    negative statuses, which subprocess uses for a command killed by a
    signal. As a special case, None is returned when rtmpdump exits with
    status 2.

    Failures are reported on standard error.
    """
    try:
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        output = p.communicate()[0].strip()
    except OSError as e:
        _eprint("Could not run '%s' (errno: %d): %s"
                % (' '.join(cmd), e.errno, e.strerror))
        return False

    if p.returncode == 0:
        return True

    # A hack for rtmpdump...
    if p.returncode == 2 and cmd[0] == 'rtmpdump':
        return None

    # The captured output is a byte string on Python 3; make it text so
    # that it can be split and formatted below.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'replace')
    indented = '\n'.join(' %s' % line for line in output.split('\n'))
    _eprint("Could not run '%s' (exit code %d):\n%s"
            % (' '.join(cmd), p.returncode, indented))
    return False
380 381
def plays(gobj, coach=True):
    """
    Returns an ordered dictionary of all plays for a particular game
    with timings for the coach footage. If coach is False, then the
    timings will be for the broadcast footage.

    The game must be a nflgame.game.Game object.

    If there is a problem retrieving the data, None is returned.

    Results are kept in a per-process cache, one for coach timings and
    one for broadcast timings. A cached result is only reused once the
    game is over.

    If the game is over, then the XML data is saved to disk.
    """
    if coach:
        cache = __coach_cache
    else:
        cache = __broadcast_cache

    if gobj.game_over() and gobj.eid in cache:
        return cache[gobj.eid]

    rawxml = _get_xml_data(gobj.eid, gobj.gamekey)
    ps = _xml_plays(rawxml, coach)
    if ps is None:
        return None
    if len(ps) == 0:
        _eprint('Could not find timing nodes in XML data, '
                'which provide the start time of each play.')
        return None

    # Store into the cache that matches the requested timings. Writing
    # coach timings into the broadcast cache would hand them out to a
    # later broadcast request and leave the coach cache empty forever.
    cache[gobj.eid] = ps

    # Save the XML data to disk if the game is over.
    fp = _xmlf % (gobj.eid, gobj.gamekey)
    if gobj.game_over() and not os.access(fp, os.R_OK):
        try:
            out = gzip.open(fp, 'wb')
            try:
                out.write(rawxml)
            finally:
                # Closing flushes the data and writes the gzip trailer.
                out.close()
        except IOError:
            _eprint('Could not cache XML data. Please make '
                    '"%s" writable.' % path.dirname(fp))
    return ps
421 422
def play(gobj, playid, coach=True):
    """
    Returns a Play object given a game and a play id with timings for
    the coach footage. If coach is False, then the timings will be for
    the broadcast footage.

    The game must be a nflgame.game.Game object.

    If a play with the given id does not exist, or if the play-by-play
    data of the game could not be retrieved, None is returned.
    """
    # Forward coach so that broadcast timings can actually be requested.
    ps = plays(gobj, coach)
    if ps is None:
        return None
    return ps.get(playid, None)
434 435
class Play (object):
    """
    A single play together with the meta data tying it to game footage.

    start is the time at which the play begins in the footage (taken
    from the 'ArchiveTCIN' or 'CATIN' node). There is no record of when
    a play stops, so end is the start time of the next play, or None for
    the last play recorded.

    playid is the foreign key that maps to play data stored in nflgame.
    """

    def __init__(self, start, end, playid):
        self.start = start
        self.end = end
        self.playid = playid

    def idstr(self):
        """Returns the play id as a string, zero padded to four digits."""
        number = int(self.playid)
        return '%04d' % number

    def __str__(self):
        fields = (self.playid, self.start, self.end)
        return '(%s, %s, %s)' % fields
455 456
class PlayTime (object):
    """
    Represents a footage time point, in the format HH:MM:SS:MMM where
    MMM can be either 2 or 3 digits.

    The components are available as the attributes hh, mm, ss and milli.
    milli is always expressed in milliseconds, whichever form the time
    point was given in.

    Time points compare (and hash) by their value in fractional seconds.
    """

    def __init__(self, point=None, seconds=None):
        """
        Construct a PlayTime object given a point in time in the format
        HH:MM:SS:MMM where MMM can be either 2 or 3 digits.

        Alternatively, seconds can be provided (which may be a float).
        It is rounded to the nearest millisecond.

        An AssertionError is raised if point does not consist of four
        colon separated integers.
        """
        if seconds is not None:
            # Round to whole milliseconds once, up front. Truncating the
            # fractional part instead loses a millisecond to float error
            # (e.g. 13.12 seconds would become 119 milliseconds).
            total_milli = int(round(seconds * 1000))
            total_seconds, milli = divmod(total_milli, 1000)
            hh, remainder = divmod(total_seconds, 3600)
            mm, ss = divmod(remainder, 60)

            self.hh, self.mm, self.ss, self.milli = hh, mm, ss, milli
            self.__point = '%02d:%02d:%02d:%03d' % (hh, mm, ss, milli)
            return

        self.__point = point

        # Validation raises AssertionError explicitly instead of using
        # assert statements, which are stripped when running with -O.
        parts = point.split(':')
        if len(parts) != 4:
            raise AssertionError('Expected 4 parts but got %d in: %s'
                                 % (len(parts), point))

        # Coach timings carry three digits in the last part.
        self.__coach = len(parts[3]) == 3
        try:
            hh, mm, ss, milli = [int(part) for part in parts]
        except ValueError:
            raise AssertionError('Bad play time format: %s' % point)

        # I believe milliseconds is given in tens of milliseconds
        # for the ArchiveTCIN node. But the CATIN node (coach timing)
        # provides regular milliseconds.
        if not self.__coach:
            milli *= 10

        self.hh, self.mm, self.ss, self.milli = hh, mm, ss, milli

    def add_seconds(self, seconds):
        """
        Returns a new PlayTime with seconds (int or float) added to self.
        """
        return PlayTime(seconds=self.fractional() + seconds)

    def seconds(self):
        """
        Returns this time point rounded to the nearest second.
        """
        secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
        # milli is in milliseconds, so half a second is 500.
        if self.milli >= 500:
            secs += 1
        return secs

    def fractional(self):
        """
        Returns this time point as fractional seconds based on milliseconds.
        """
        secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
        secs = (1000 * secs) + self.milli
        return float(secs) / 1000.0

    def __cmp__(self, other):
        # Written without the cmp builtin, which Python 3 does not have.
        mine, theirs = self.fractional(), other.fractional()
        return (mine > theirs) - (mine < theirs)

    # Python 3 ignores __cmp__, so the rich comparisons are spelled out.
    def __eq__(self, other):
        return self.__cmp__(other) == 0

    def __ne__(self, other):
        return self.__cmp__(other) != 0

    def __lt__(self, other):
        return self.__cmp__(other) < 0

    def __le__(self, other):
        return self.__cmp__(other) <= 0

    def __gt__(self, other):
        return self.__cmp__(other) > 0

    def __ge__(self, other):
        return self.__cmp__(other) >= 0

    def __hash__(self):
        # Keeps instances hashable on Python 3 and agrees with __eq__.
        return hash(self.fractional())

    def __sub__(self, other):
        """
        Returns the difference rounded to nearest second between
        two time points. The 'other' time point must take place before the
        current time point, otherwise an AssertionError is raised.
        """
        if not other <= self:
            raise AssertionError('%s is not <= than %s' % (other, self))
        return int(round(self.fractional() - other.fractional()))

    def __str__(self):
        return self.__point
545 546
def _xml_plays(data, coach=True):
    """
    Turns raw play-by-play XML into an ordered dictionary of Play
    objects, keyed by play id.

    Start times are read from the 'catin' node of each row (coach
    timings) or, if coach is False, from the 'archivetcin' node
    (broadcast timings). The end of a play is the start of the row that
    follows it; the last row has no end.

    Returns None if data is None.
    """
    if data is None:
        return None

    timing_tag = 'catin' if coach else 'archivetcin'

    # Collect every usable row first: computing the duration of a play
    # requires looking ahead at the start time of the next one.
    timed = []
    for node in bs4.BeautifulSoup(data).find_all('row'):
        id_node = node.find('id')
        if id_node:
            pid = id_node.get_text().strip()
        else:
            # Fall back to the 'playid' attribute of the row itself.
            pid = node.get('playid', None)
            if not pid:
                continue
            pid = pid.strip()

        time_node = node.find(timing_tag)
        if not time_node:
            continue
        began = PlayTime(time_node.get_text().strip())

        # Drop rows that start before the previously accepted row.
        if len(timed) > 0 and began < timed[-1][1]:
            continue
        timed.append((pid, began, node))

    def skipped(node):
        # Timeouts and two-minute warnings take a lot of time but aren't
        # needed for play-by-play footage.
        if 'playdescription' in node.attrs:
            description = node['playdescription'].lower()
            if description.startswith('timeout'):
                return True
            if description.startswith('two-minute'):
                return True

        # Did we miss anything?
        if 'preplaybyplay' in node.attrs:
            if node['preplaybyplay'].lower().startswith('timeout'):
                return True
        return False

    found = OrderedDict()
    last = len(timed) - 1
    for index, (pid, began, node) in enumerate(timed):
        if skipped(node):
            continue
        if index < last:
            finished = timed[index + 1][1]
        else:
            finished = None
        found[pid] = Play(began, finished, pid)
    return found
def _read_gzipped(fpath):
    """Returns the decompressed contents of the gzip file at fpath."""
    handle = gzip.open(fpath)
    try:
        return handle.read()
    finally:
        handle.close()


def _get_xml_data(eid=None, gamekey=None, fpath=None):
    """
    Returns the XML play data corresponding to the game given. A game must
    be specified in one of two ways: by providing the eid and gamekey or
    by providing the file path to a gzipped XML file. An AssertionError
    is raised if neither is provided.

    If the XML data is already on disk, it is read, decompressed and returned.

    Otherwise, the XML data is downloaded from the NFL web site. If the data
    doesn't exist yet or there was an error, _get_xml_data returns None.
    """
    # Raised explicitly so that the check survives running with -O.
    if fpath is None and (eid is None or gamekey is None):
        raise AssertionError(
            'Either eid and gamekey or fpath must be provided.')

    if fpath is not None:
        return _read_gzipped(fpath)

    fpath = _xmlf % (eid, gamekey)
    if os.access(fpath, os.R_OK):
        return _read_gzipped(fpath)

    # The eid starts with the date of the game (YYYYMMDD). Games played
    # from January to March are filed under the previous year.
    year = int(eid[0:4])
    month = int(eid[4:6])
    if month <= 3:
        year -= 1
    u = _xml_base_url % (year, gamekey)  # The year and the game key.
    try:
        response = urllib2.urlopen(u, timeout=10)
        try:
            return response.read()
        finally:
            response.close()
    except (urllib2.URLError, socket.timeout) as e:
        # URLError is the base class of HTTPError, so this also covers
        # failures to reach the server at all.
        _eprint(e)
    return None
642