Package nflvid
[frames] | no frames]

Source Code for Package nflvid

  1  """ 
  2  Introduction 
  3  ============ 
  4  A simple library to download, slice and search NFL game footage on a 
  5  play-by-play basis. 
  6   
  7  This library comes with preloaded play-by-play meta data, which describes the 
  8  start time of each play in the game footage. However, the actual footage does 
  9  not come with this library and is not released by me. This package therefore 
 10  provides utilities to batch download NFL Game Footage from the original source. 
 11   
 12  Once game footage is downloaded, you can use this library to search plays and 
 13  construct a playlist to play in any video player. 
 14  """ 
 15   
 16  import gzip 
 17  import math 
 18  import os 
 19  import os.path as path 
 20  import socket 
 21  import sys 
 22  import tempfile 
 23  import urllib2 
 24   
 25  import bs4 
 26   
 27  import eventlet 
 28  httplib2 = eventlet.import_patched('httplib2') 
 29  import eventlet.green.subprocess as subprocess 
 30   
 31  from nflgame import OrderedDict 
 32   
 33  _xmlf = path.join(path.split(__file__)[0], 'pbp-xml', '%s-%s.xml.gz') 
 34  _xml_base_url = 'http://e2.cdnl3.neulion.com/nfl/edl/nflgr/%d/%s.xml' 
 35  _coach_url = 'rtmp://neulionms.fcod.llnwd.net/a5306/e1/mp4:' \ 
 36               'u/nfl/nfl/coachtapes/%s/%s_all_1600' 
 37  _coach_url = ( 
 38      'rtmp://neulionms.fcod.llnwd.net', 
 39      'a5306/e1', 
 40      'mp4:u/nfl/nfl/coachtapes/%s/%s_all_1600', 
 41  ) 
 42  _broadcast_url = 'http://nlds82.cdnl3nl.neulion.com/nlds_vod/nfl/vod/' \ 
 43                   '%s/%s/%s/%s/2_%s_%s_%s_%s_h_whole_1_%s.mp4.m3u8' 
 44   
 45  __broadcast_cache = {}  # game eid -> play id -> Play 
 46  __coach_cache = {}  # game eid -> play id -> Play 
 47   
 48   
49 -def _eprint(s):
50 print >> sys.stderr, s
51 52
def broadcast_url(gobj, quality='1600'):
    """
    Builds the HTTP Live Stream URL (an m3u8 file) of the broadcast
    footage for the given game at the given quality.

    The month and day in the URL are read from the game's eid
    (characters 4-5 and 6-7 respectively).

    Note that this does not work with every game (yet). In particular,
    URLs vary unpredictably (to me) from game to game.
    """
    eid = gobj.eid
    fields = (
        gobj.season(),
        eid[4:6],  # month
        eid[6:8],  # day
        gobj.gamekey,
        gobj.gamekey,
        gobj.away.lower(),
        gobj.home.lower(),
        gobj.season(),
        quality,
    )
    return _broadcast_url % fields
65 66
def coach_url(gobj):
    """
    Builds the rtmp location of the coach footage for the given game.

    The result is a triple::

        (rtmp server, rtmp app name, rtmp playpath)

    where the playpath has the game's season and gamekey filled in.

    Coach video only comes in 1600 quality.
    """
    server = _coach_url[0]
    app = _coach_url[1]
    playpath = _coach_url[2] % (gobj.season(), gobj.gamekey)
    return (server, app, playpath)
81 82
def footage_full(footage_dir, gobj):
    """
    Locates the full video of a game inside an nflvid footage directory.

    Returns the path of the video, or None when no readable file exists
    at that path.
    """
    full = _full_path(footage_dir, gobj)
    return full if os.access(full, os.R_OK) else None
94 95
def footage_plays(footage_play_dir, gobj):
    """
    Returns a list of all footage broken down by play inside an nflvid
    footage directory. The list holds file names (not full paths) and is
    sorted numerically by play id.

    The play id of a file is its name without the last four characters
    (the '.mp4' extension). Directory entries whose remaining name is not
    an integer are skipped.

    If no footage breakdown exists for the game provided, then an empty list
    is returned.
    """
    fp = _play_path(footage_play_dir, gobj)
    if not os.access(fp, os.R_OK):
        return []

    numbered = []
    for name in os.listdir(fp):
        try:
            numbered.append((int(name[:-4]), name))
        except ValueError:
            # A stray entry (hidden file, partial download, ...) must not
            # make the whole listing fail.
            continue
    # Sort on the play number only; the sort is stable, so ties keep the
    # order in which os.listdir produced them.
    numbered.sort(key=lambda pair: pair[0])
    return [name for _, name in numbered]
108 109
def footage_play(footage_play_dir, gobj, playid):
    """
    Looks up the slice of a single play inside the footage play directory.

    The file is expected at {game directory}/{playid}.mp4, with the play id
    zero padded to four digits. Its path is returned when the file is
    readable; otherwise None is returned.
    """
    game_directory = _play_path(footage_play_dir, gobj)
    filename = '%04d.mp4' % int(playid)
    candidate = path.join(game_directory, filename)
    if os.access(candidate, os.R_OK):
        return candidate
    return None
122 123
124 -def _full_path(footage_dir, g):
125 return path.join(footage_dir, '%s-%s.mp4' % (g.eid, g.gamekey))
126 127
128 -def _play_path(footage_play_dir, g):
129 return path.join(footage_play_dir, '%s-%s' % (g.eid, g.gamekey))
130 131
132 -def _nice_game(gobj):
133 return '(Season: %s, Week: %s, %s)' \ 134 % (gobj.schedule['year'], gobj.schedule['week'], gobj)
135 136
def unsliced_plays(footage_play_dir, gobj, coach=True, dry_run=False):
    """
    Lists the plays of a game that have not been sliced yet.

    A play counts as sliced only if the following file is readable, where
    {playid} is the zero padded id given by the play's idstr()::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    Every other play of the game is part of the returned list, so an empty
    list means that all plays have been sliced. None is returned if the
    play-by-play meta data could not be retrieved.

    If coach is False, then play timings for broadcast footage will be
    used instead of coach timings.

    If dry_run is True, then only the first 10 plays of the game are
    considered.
    """
    ps = plays(gobj, coach)
    outdir = _play_path(footage_play_dir, gobj)
    if ps is None:
        return None

    candidates = ps.values()
    if dry_run:
        candidates = list(candidates)[:10]

    def is_sliced(p):
        return os.access(path.join(outdir, '%s.mp4' % p.idstr()), os.R_OK)

    return [p for p in candidates if not is_sliced(p)]
172 173
def slice(footage_play_dir, full_footage_file, gobj, coach=True,
          threads=4, dry_run=False):
    """
    Cuts a full game video into one video per play.

    full_footage_file should point to a full game downloaded with
    nflvid-footage and gobj should be the corresponding nflgame.game.Game
    object. The pieces are written to::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    The game directory is created if it is not accessible. Only plays
    reported by unsliced_plays are processed, so existing slices are not
    regenerated.

    Each play is handed to slice_play inside an eventlet green pool of at
    most `threads` workers. This function only returns once every worker
    has finished.

    If coach is False, then play timings for broadcast footage will be
    used instead of coach timings.

    If dry_run is true, then only the first 10 plays of the game are
    sliced.
    """
    game_directory = _play_path(footage_play_dir, gobj)
    if not os.access(game_directory, os.R_OK):
        os.makedirs(game_directory)

    remaining = unsliced_plays(footage_play_dir, gobj, coach, dry_run)
    if not remaining:
        # Covers both "nothing left to do" (empty list) and "no meta data"
        # (None).
        _eprint(
            'There are no unsliced plays remaining for game %s %s.\n'
            'If they have not been sliced yet, then the XML play-by-play '
            'meta data may not be available or is corrupt.'
            % (gobj, _nice_game(gobj)))
        return

    workers = eventlet.greenpool.GreenPool(threads)
    for one_play in remaining:
        # No duration cap (0); cut the scoreboard (True).
        workers.spawn_n(slice_play, footage_play_dir, full_footage_file,
                        gobj, one_play, 0, True)
    workers.waitall()

    _eprint('DONE slicing game %s' % _nice_game(gobj))
220 221
def artificial_slice(footage_play_dir, gobj, gobj_play):
    """
    Renders a stand-in video for a play: ten seconds of a single static
    image that shows the textual description of the play. This gives a play
    some representation even if its video form doesn't exist (or, more
    likely, the play-by-play meta data for that play is corrupt).

    The image is rendered by ImageMagick's convert, which must have pango
    support, and is turned into a video by ffmpeg.

    Note that gobj_play is an nflgame.game.Play object and not an
    nflvid.Play object.
    """
    filename = '%04d.mp4' % int(gobj_play.playid)
    target = path.join(_play_path(footage_play_dir, gobj), filename)

    span_open = '<span size="20000" foreground="white">'
    # The temporary image only has to live until ffmpeg has read it, so
    # both commands run inside the with block.
    with tempfile.NamedTemporaryFile(mode='w+', suffix='.png') as still:
        render = [
            'convert',
            '-size', '640x480',  # size of coach footage. configurable?
            '-background', 'black',
            'pango:%s%s</span>' % (span_open, gobj_play),
            still.name,
        ]
        _run_command(render)

        encode = [
            'ffmpeg',
            '-f', 'image2',
            '-loop', '1',
            '-r:v', '7',
            '-i', still.name,
            '-pix_fmt', 'yuv420p',
            '-an',
            '-t', '10',
            target,
        ]
        _run_command(encode)
262
def slice_play(footage_play_dir, full_footage_file, gobj, play,
               max_duration=0, cut_scoreboard=True):
    """
    This is just like slice, but it only slices the play provided.
    In typical cases, slice should be used since it makes sure not
    to duplicate work.

    This function will not check if the play-by-play directory for
    gobj has been created.

    max_duration is used to cap the length of a play. This drastically
    cuts down on the time required to slice a game and the storage
    requirements of a game at the cost of potentially missing bigger
    plays. This is particularly useful if you are slicing broadcast
    footage, where imposing a cap at about 15 seconds can decrease
    storage and CPU requirements by more than half without missing much.
    A value of 0 (the default) disables the cap.

    When cut_scoreboard is True, the first 3.0 seconds of
    the play will be clipped to remove the scoreboard view.

    A play without an end time is given a length of 40 seconds.
    """
    outdir = _play_path(footage_play_dir, gobj)
    st = play.start
    outpath = path.join(outdir, '%s.mp4' % play.idstr())

    et = play.end
    if et is None:  # Probably the last play of the game.
        et = st.add_seconds(40)
    if max_duration > 0 and (et.seconds() - st.seconds()) > max_duration:
        et = st.add_seconds(max_duration)

    if cut_scoreboard:
        st = st.add_seconds(3.0)

    dr = PlayTime(seconds=et.fractional() - st.fractional())

    # The milliseconds must be zero padded to three digits. ffmpeg reads
    # the text after the dot as a decimal fraction, so 45 ms written as
    # '.45' would be taken as 450 ms.
    stamp = '%02d:%02d:%02d.%03d'
    start_time = stamp % (st.hh, st.mm, st.ss, st.milli)
    duration = stamp % (dr.hh, dr.mm, dr.ss, dr.milli)
    cmd = ['ffmpeg',
           '-ss', start_time,
           '-t', duration,
           '-i', full_footage_file,
           '-acodec', 'copy',
           '-vcodec', 'copy',
           outpath,
           ]
    _run_command(cmd)
309 310
def download_broadcast(footage_dir, gobj, quality='1600', dry_run=False):
    """
    Downloads the full broadcast of the given game with ffmpeg at the
    quality provided. The qualities available are:
    400, 800, 1200, 1600, 2400, 3000, 4500 with 4500 being the best.

    The footage will be saved to the following path::

        footage_dir/{eid}-{gamekey}.mp4

    If footage is already at that path, then a LookupError is raised.

    If dry_run is True, only the first 30 seconds are downloaded.

    Progress and failures are reported on stderr; nothing is returned.

    A full game's worth of footage at a quality of 1600 is about 2GB.
    """
    target = _full_path(footage_dir, gobj)
    if os.access(target, os.R_OK):
        raise LookupError('Footage path "%s" already exists.' % target)

    url = broadcast_url(gobj, quality)

    # A missing URL is a common failure. Probe it with a HEAD request so
    # that we can show something nicer than a bunch of ffmpeg vomit.
    response, _ = httplib2.Http().request(url, 'HEAD')
    http_status = response['status']
    if http_status != '200':
        _eprint('BAD URL (http status %s) for game %s: %s'
                % (http_status, _nice_game(gobj), url))
        _eprint('FAILED to download game %s' % _nice_game(gobj))
        return

    cmd = ['ffmpeg', '-timeout', '60', '-i', url]
    if dry_run:
        cmd.extend(['-t', '30'])
    cmd.extend([
        '-absf', 'aac_adtstoasc',  # no idea. ffmpeg says I need it though.
        '-acodec', 'copy',
        '-vcodec', 'copy',
        target,
    ])

    _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
    if _run_command(cmd):
        _eprint('DONE with game %s' % _nice_game(gobj))
    else:
        _eprint('FAILED to download game %s' % _nice_game(gobj))
357 358
def download_coach(footage_dir, gobj, dry_run=False):
    """
    Downloads the full coach footage of the given game with rtmpdump.
    Currently, the only quality available is 1600.

    The footage will be saved to the following path::

        footage_dir/{eid}-{gamekey}.mp4

    If footage is already at that path, then a LookupError is raised.

    If dry_run is True, rtmpdump is told to stop after 30 seconds.

    Progress and failures are reported on stderr; nothing is returned.

    A full game's worth of footage at a quality of 1600 is about 1GB.
    """
    target = _full_path(footage_dir, gobj)
    if os.access(target, os.R_OK):
        raise LookupError('Footage path "%s" already exists.' % target)

    server, app, playpath = coach_url(gobj)

    cmd = [
        'rtmpdump',
        '--rtmp', server,
        '--app', app,
        '--playpath', playpath,
        '--timeout', '60',
    ]
    if dry_run:
        cmd.extend(['--stop', '30'])
    cmd.extend(['-o', target])

    _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
    outcome = _run_command(cmd)
    if outcome is None:
        # _run_command reports rtmpdump's exit code 2 as None.
        _eprint('DONE (incomplete) with game %s' % _nice_game(gobj))
    elif outcome:
        _eprint('DONE with game %s' % _nice_game(gobj))
    else:
        _eprint('FAILED to download game %s' % _nice_game(gobj))
396 397
def _run_command(cmd):
    """
    Runs the command given as an argument list and waits for it to finish.
    stdout and stderr of the command are captured together.

    Returns True if the exit code is not positive, None if rtmpdump exits
    with code 2 and False otherwise. Every False result is accompanied by
    a message on stderr; this includes the case where the command could
    not be started at all.
    """
    try:
        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        captured = proc.communicate()[0].strip()
    except OSError as e:
        _eprint("Could not run '%s' (errno: %d): %s"
                % (' '.join(cmd), e.errno, e.strerror))
        return False

    code = proc.returncode
    if not code > 0:
        return True

    # A hack for rtmpdump...
    if code == 2 and cmd[0] == 'rtmpdump':
        return None

    indented = '\n'.join(' %s' % line for line in captured.split('\n'))
    _eprint("Could not run '%s' (exit code %d):\n%s"
            % (' '.join(cmd), code, indented))
    return False
422 423
def plays(gobj, coach=True):
    """
    Returns an ordered dictionary of all plays for a particular game
    with timings for the coach footage. If coach is False, then the
    timings will be for the broadcast footage.

    The game must be a nflgame.game.Game object.

    If there is a problem retrieving the data, None is returned.

    Results are kept in memory, with one cache for coach timings and one
    for broadcast timings. A cached result is only reused once the game is
    over.

    If the game is over, then the XML data is saved to disk.
    """
    cache = __coach_cache if coach else __broadcast_cache

    if gobj.game_over() and gobj.eid in cache:
        return cache[gobj.eid]

    rawxml = _get_xml_data(gobj.eid, gobj.gamekey)
    ps = _xml_plays(rawxml, coach)
    if ps is None:
        return None
    if len(ps) == 0:
        _eprint('Could not find timing nodes in XML data, '
                'which provide the start time of each play.')
        return None
    # Store into the cache that matches the requested timings. Writing coach
    # timings into the broadcast cache would hand them out to a later
    # coach=False call.
    cache[gobj.eid] = ps

    # Save the XML data to disk if the game is over.
    fp = _xmlf % (gobj.eid, gobj.gamekey)
    if gobj.game_over() and not os.access(fp, os.R_OK):
        try:
            out = gzip.open(fp, 'wb')
            try:
                out.write(rawxml)
            finally:
                # Closing is what flushes the gzip trailer to disk.
                out.close()
        except IOError:
            _eprint('Could not cache XML data. Please make '
                    '"%s" writable.' % path.dirname(fp))
    return ps
463 464
def play(gobj, playid, coach=True):
    """
    Returns a Play object given a game and a play id with timings for
    the coach footage. If coach is False, then the timings will be for
    the broadcast footage.

    The game must be a nflgame.game.Game object.

    If a play with the given id does not exist, or the play-by-play data
    of the game could not be retrieved, None is returned.
    """
    # Forward coach so that coach=False really selects broadcast timings.
    ps = plays(gobj, coach)
    if ps is None:
        return None
    return ps.get(playid, None)
476 477
class Play (object):
    """
    A single play together with the meta data that ties it to game footage.

    start is the time at which the play begins in the footage (taken from
    the 'ArchiveTCIN' or 'CATIN' timing). There is no record of when a play
    stops, so end is the start time of the next play, or None for the last
    play recorded.

    The play id is the foreign key that maps to play data stored in nflgame.
    """
    def __init__(self, start, end, playid):
        self.start = start
        self.end = end
        self.playid = playid

    def idstr(self):
        """Returns the play id as a string, zero padded to four digits."""
        number = int(self.playid)
        return '%04d' % number

    def __str__(self):
        fields = (self.playid, self.start, self.end)
        return '(%s, %s, %s)' % fields
497 498
class PlayTime (object):
    """
    Represents a footage time point, in the format HH:MM:SS:MMM where
    MMM can be either 2 or 3 digits.

    Instances expose the integer attributes hh, mm, ss and milli (always
    expressed in milliseconds) and compare by their value in fractional
    seconds.
    """
    def __init__(self, point=None, seconds=None):
        """
        Construct a PlayTime object given a point in time in the format
        HH:MM:SS:MMM where MMM can be either 2 or 3 digits.

        Alternatively, seconds can be provided (which may be a float).
        If both are given, seconds wins.

        An AssertionError is raised if point does not consist of exactly
        four colon separated integers.
        """
        if seconds is not None:
            # Work in whole milliseconds. Rounding, instead of truncating
            # the fractional part, keeps a value such as 65.345 from
            # becoming 344 ms through floating point error, and divmod
            # carries a rounded-up 1000 ms into the seconds.
            total_ms = int(round(seconds * 1000))
            whole, milli = divmod(total_ms, 1000)
            hh, rest = divmod(whole, 3600)
            mm, ss = divmod(rest, 60)

            self.hh, self.mm, self.ss, self.milli = hh, mm, ss, milli
            self.__coach = True  # milli already is in milliseconds
            self.__point = '%02d:%02d:%02d:%03d' % (hh, mm, ss, milli)
            return

        self.__point = point

        # Errors are raised explicitly (not via the assert statement) so
        # that the checks survive python -O. The exception type is the one
        # the assert statement used to raise.
        parts = point.split(':')
        if len(parts) != 4:
            raise AssertionError('Expected 4 parts but got %d in: %s'
                                 % (len(parts), point))
        try:
            numbers = [int(part) for part in parts]
        except ValueError:
            raise AssertionError('Bad play time format: %s' % point)

        # A three digit last field marks coach timing.
        self.__coach = len(parts[3]) == 3
        self.hh, self.mm, self.ss, self.milli = numbers

        # I believe milliseconds is given in tens of milliseconds
        # for the ArchiveTCIN node. But the CATIN node (coach timing)
        # provides regular milliseconds.
        if not self.__coach:
            self.milli *= 10

    def add_seconds(self, seconds):
        """
        Returns a new PlayTime with seconds (int or float) added to self.
        """
        return PlayTime(seconds=self.fractional() + seconds)

    def seconds(self):
        """
        Returns this time point rounded to the nearest second.
        """
        secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
        # milli is in milliseconds, so half a second is 500.
        if self.milli >= 500:
            secs += 1
        return secs

    def fractional(self):
        """
        Returns this time point as fractional seconds based on milliseconds.
        """
        secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
        secs = (1000 * secs) + self.milli
        return float(secs) / 1000.0

    def __cmp__(self, other):
        # Python 2 three-way comparison, written without the cmp builtin.
        mine, theirs = self.fractional(), other.fractional()
        return (mine > theirs) - (mine < theirs)

    # Rich comparisons: Python 3 ignores __cmp__, so ordering and equality
    # are spelled out here. All of them compare fractional seconds.
    def __eq__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() == other.fractional()

    def __ne__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() != other.fractional()

    def __lt__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() < other.fractional()

    def __le__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() <= other.fractional()

    def __gt__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() > other.fractional()

    def __ge__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() >= other.fractional()

    def __hash__(self):
        # Defining __eq__ removes the default hash on Python 3; keep
        # instances hashable, consistently with equality.
        return hash(self.fractional())

    def __sub__(self, other):
        """
        Returns the difference rounded to nearest second between
        two time points. The 'other' time point must take place before the
        current time point; otherwise an AssertionError is raised.
        """
        if not other <= self:
            raise AssertionError('%s is not <= than %s' % (other, self))
        return int(round(self.fractional() - other.fractional()))

    def __str__(self):
        return self.__point
587 588
def _xml_plays(data, coach=True):
    """
    Turns raw play-by-play XML into an ordered dictionary of Play objects,
    keyed by play id.

    Start times are read from the coach timing node of each row. If coach
    is set to False, the broadcast timing node is read instead.

    None is returned if data is None.
    """
    if data is None:
        return None

    timing_tag = 'catin' if coach else 'archivetcin'

    # Collect every usable row first: the end of a play is the start of
    # the row that follows it, so we need to be able to look ahead.
    timed = []
    for node in bs4.BeautifulSoup(data).find_all('row'):
        id_node = node.find('id')
        if id_node:
            pid = id_node.get_text().strip()
        else:
            # No <id> child; fall back to the 'playid' attribute.
            pid = node.get('playid', None)
            if not pid:
                continue
            pid = pid.strip()

        start_node = node.find(timing_tag)
        if not start_node:
            continue
        began = PlayTime(start_node.get_text().strip())

        # Skip a row that starts before the previously accepted row.
        if timed and began < timed[-1][1]:
            continue
        timed.append((pid, began, node))

    def skip(node):
        # Timeouts and the two-minute warning take a lot of time but
        # aren't needed for play-by-play footage.
        attrs = node.attrs
        if 'playdescription' in attrs:
            description = node['playdescription'].lower()
            if description.startswith('timeout'):
                return True
            if description.startswith('two-minute'):
                return True

        # Did we miss anything?
        if 'preplaybyplay' in attrs:
            if node['preplaybyplay'].lower().startswith('timeout'):
                return True
        return False

    result = OrderedDict()
    last = len(timed) - 1
    for idx, (pid, began, node) in enumerate(timed):
        if skip(node):
            continue
        # The end is the start of the next collected row, even if that
        # row itself is skipped.
        finish = timed[idx + 1][1] if idx < last else None
        result[pid] = Play(began, finish, pid)
    return result
def _get_xml_data(eid=None, gamekey=None, fpath=None):
    """
    Returns the XML play data corresponding to the game given. A game must
    be specified in one of two ways: by providing the eid and gamekey or
    by providing the file path to a gzipped XML file.

    If the XML data is already on disk, it is read, decompressed and returned.

    Otherwise, the XML data is downloaded from the NFL web site. If the data
    doesn't exist yet or there was an error (HTTP error, unreachable server
    or timeout), the error is printed to stderr and _get_xml_data returns
    None.
    """
    assert (eid is not None and gamekey is not None) or fpath is not None

    def read_gzipped(p):
        # Close the handle explicitly instead of leaving it to the
        # garbage collector.
        fobj = gzip.open(p)
        try:
            return fobj.read()
        finally:
            fobj.close()

    if fpath is not None:
        return read_gzipped(fpath)

    fpath = _xmlf % (eid, gamekey)
    if os.access(fpath, os.R_OK):
        return read_gzipped(fpath)

    # Games played in January through March are filed under the previous
    # year.
    year = int(eid[0:4])
    month = int(eid[4:6])
    if month <= 3:
        year -= 1
    u = _xml_base_url % (year, gamekey)  # The year and the game key.
    try:
        resp = urllib2.urlopen(u, timeout=10)
        try:
            return resp.read()
        finally:
            resp.close()
    except urllib2.URLError as e:
        # URLError is the base class of HTTPError, so this also covers
        # failures to reach the server at all.
        _eprint(e)
    except socket.timeout as e:
        _eprint(e)
    return None
684