Package nflvid
[frames | no frames]

Source Code for Package nflvid

  1  """ 
  2  Introduction 
  3  ============ 
  4  A simple library to download, slice and search NFL game footage on a 
  5  play-by-play basis. 
  6   
  7  This library comes with preloaded play-by-play meta data, which describes the 
  8  start time of each play in the game footage. However, the actual footage does 
  9  not come with this library and is not released by me. This package therefore 
 10  provides utilities to batch download NFL Game Footage from the original source. 
 11   
 12  Once game footage is downloaded, you can use this library to search plays and 
 13  construct a playlist to play in any video player. 
 14  """ 
 15   
 16  import gzip 
 17  import math 
 18  import os 
 19  import os.path as path 
 20  import socket 
 21  import sys 
 22  import tempfile 
 23  import urllib2 
 24   
 25  import bs4 
 26   
 27  import eventlet 
 28  httplib2 = eventlet.import_patched('httplib2') 
 29  import eventlet.green.subprocess as subprocess 
 30   
 31  from nflgame import OrderedDict 
 32   
 33  _xmlf = path.join(path.split(__file__)[0], 'pbp-xml', '%s-%s.xml.gz') 
 34  _xml_base_url = 'http://e2.cdnl3.neulion.com/nfl/edl/nflgr/%d/%s.xml' 
 35  _coach_url = 'rtmp://neulionms.fcod.llnwd.net/a5306/e1/mp4:' \ 
 36               'u/nfl/nfl/coachtapes/%s/%s_all_1600' 
 37  _coach_url = ( 
 38      'rtmp://neulionms.fcod.llnwd.net', 
 39      'a5306/e1', 
 40      'mp4:u/nfl/nfl/coachtapes/%s/%s_all_1600', 
 41  ) 
 42  _broadcast_url = 'http://nlds82.cdnl3nl.neulion.com/nlds_vod/nfl/vod/' \ 
 43                   '%s/%s/%s/%s/2_%s_%s_%s_%s_h_whole_1_%s.mp4.m3u8' 
 44   
 45  __broadcast_cache = {}  # game eid -> play id -> Play 
 46  __coach_cache = {}  # game eid -> play id -> Play 
 47   
 48   
def _eprint(s):
    """
    Prints s, followed by a newline, on standard error.
    """
    stream = sys.stderr
    print >> stream, s
51 52
def broadcast_url(gobj, quality='1600'):
    """
    Builds the HTTP Live Stream URL (an m3u8 file) of the broadcast
    footage for the game and quality given.

    The month and day are taken from characters 4-5 and 6-7 of the
    game's eid.

    Note that this does not work with every game (yet). In particular,
    URLs vary unpredictably from game to game.
    """
    eid = gobj.eid
    month = eid[4:6]
    day = eid[6:8]
    fields = (
        gobj.season(), month, day, gobj.gamekey, gobj.gamekey,
        gobj.away.lower(), gobj.home.lower(), gobj.season(), quality,
    )
    return _broadcast_url % fields
65 66
def coach_url(gobj):
    """
    Builds the rtmp location of the coach footage for the game given.
    The result is a triple whose elements are::

        (rtmp server, rtmp app name, rtmp playpath)

    Coach video only comes in 1600 quality.
    """
    server, app, playpath_template = _coach_url
    playpath = playpath_template % (gobj.season(), gobj.gamekey)
    return (server, app, playpath)
81 82
def footage_full(footage_dir, gobj):
    """
    Locates the full video of a game inside an nflvid footage directory.

    The path is returned only when the file is readable; otherwise the
    result is None.
    """
    video = _full_path(footage_dir, gobj)
    if os.access(video, os.R_OK):
        return video
    return None
94 95
def footage_plays(footage_play_dir, gobj):
    """
    Returns a list of all footage broken down by play inside an nflvid
    footage directory. The list contains file names (not full paths) and
    is sorted numerically by play id.

    Only entries named like a play slice (a numeric play id followed by a
    four character extension such as '.mp4') are considered. Anything else
    living in the directory is skipped instead of breaking the numeric
    sort.

    If no footage breakdown exists for the game provided, then an empty list
    is returned.
    """
    game_dir = _play_path(footage_play_dir, gobj)
    if not os.access(game_dir, os.R_OK):
        return []

    def play_number(name):
        return int(name[:-4])

    # Stray files (editor backups, '.DS_Store', ...) have no numeric stem and
    # would make the sort key raise a ValueError.
    slices = [name for name in os.listdir(game_dir) if name[:-4].isdigit()]
    return sorted(slices, key=play_number)
108 109
def footage_play(footage_play_dir, gobj, playid):
    """
    Locates the slice of a single play inside the footage play directory
    of the game given. The slice is expected to be named after the play
    id, zero padded to four digits, with an 'mp4' extension.

    None is returned when that file is not readable.
    """
    game_dir = _play_path(footage_play_dir, gobj)
    filename = '%04d.mp4' % int(playid)
    candidate = path.join(game_dir, filename)
    if os.access(candidate, os.R_OK):
        return candidate
    return None
122 123
def _full_path(footage_dir, g):
    """
    Builds the path of the full game video for g inside footage_dir:
    '{eid}-{gamekey}.mp4'. The file is not checked for existence.
    """
    filename = '%s-%s.mp4' % (g.eid, g.gamekey)
    return path.join(footage_dir, filename)
126 127
def _play_path(footage_play_dir, g):
    """
    Builds the path of the directory holding the play slices of g inside
    footage_play_dir: '{eid}-{gamekey}'. The directory is not checked for
    existence.
    """
    dirname = '%s-%s' % (g.eid, g.gamekey)
    return path.join(footage_play_dir, dirname)
130 131
def _nice_game(gobj):
    """
    Formats a short human readable description of a game (season, week
    and the game's own string form) for use in status messages.
    """
    schedule = gobj.schedule
    details = (schedule['year'], schedule['week'], gobj)
    return '(Season: %s, Week: %s, %s)' % details
135 136
def unsliced_plays(footage_play_dir, gobj, coach=True, dry_run=False):
    """
    Looks through the game directory inside footage_play_dir and collects
    the plays that still have to be sliced. A play counts as sliced only
    when the following file is readable, where {playid} is its zero padded
    play id::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    Every play of the game that fails this test ends up in the returned
    list, so an empty list means that the whole game has been sliced.
    None is returned when the play-by-play meta data could not be
    retrieved.

    If coach is False, then play timings for broadcast footage will be
    used instead of coach timings.

    If dry_run is True, then only the first 10 plays of the game are
    examined.
    """
    timings = plays(gobj, coach)
    game_dir = _play_path(footage_play_dir, gobj)
    if timings is None:
        return None

    candidates = timings.values()
    if dry_run:
        candidates = list(candidates)[:10]

    missing = []
    for candidate in candidates:
        target = path.join(game_dir, '%s.mp4' % candidate.idstr())
        if not os.access(target, os.R_OK):
            missing.append(candidate)
    return missing
172 173
def slice(footage_play_dir, full_footage_file, gobj, coach=True,
          threads=4, dry_run=False):
    """
    Cuts the footage file given into play-by-play pieces with ffmpeg.
    full_footage_file should be a full game downloaded with
    nflvid-footage, and gobj the nflgame.game.Game object it belongs to.

    The pieces are written below footage_play_dir (the game directory is
    created when it is missing)::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    Work is never repeated: plays that already have a readable video file
    are left alone.

    The ffmpeg instances run concurrently in an eventlet green pool of
    at most `threads` workers. The function returns only once every
    worker is done.

    If coach is False, then play timings for broadcast footage will be
    used instead of coach timings.

    If dry_run is True, then only the first 10 plays of the game are
    sliced.
    """
    outdir = _play_path(footage_play_dir, gobj)
    if not os.access(outdir, os.R_OK):
        os.makedirs(outdir)

    todo = unsliced_plays(footage_play_dir, gobj, coach, dry_run)
    if not todo:
        # Nothing to do (or no meta data). Only complain when there are
        # no sliced plays on disk either.
        if not footage_plays(footage_play_dir, gobj):
            _eprint(
                'There are no unsliced plays remaining for game %s %s.\n'
                'If they have not been sliced yet, then the XML play-by-play '
                'meta data may not be available or is corrupt.'
                % (gobj, _nice_game(gobj)))
        return

    workers = eventlet.greenpool.GreenPool(threads)
    for pending in todo:
        workers.spawn_n(slice_play, footage_play_dir, full_footage_file,
                        gobj, pending, 0, True)
    workers.waitall()

    _eprint('DONE slicing game %s %s' % (gobj.eid, _nice_game(gobj)))
223 224
def artificial_slice(footage_play_dir, gobj, gobj_play):
    """
    Creates a video file that contains a single static image with a
    textual description of the play. The purpose is to provide some
    representation of a play even if its video form doesn't exist.
    (Or more likely, the play-by-play meta data for that play is
    corrupt.)

    The video is written to the usual slice location::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    This function requires the use of ImageMagick's convert with
    pango support, and ffmpeg. If rendering the image fails, no video
    is produced.

    Note that gobj_play is an nflgame.game.Play object and not an
    nflvid.Play object.
    """
    from xml.sax.saxutils import escape

    outdir = _play_path(footage_play_dir, gobj)
    outpath = path.join(outdir, '%04d.mp4' % int(gobj_play.playid))

    # The description is embedded in Pango markup, so characters such as
    # '&' and '<' must be escaped or the markup fails to parse.
    description = escape('%s' % (gobj_play,))
    markup = '<span size="20000" foreground="white">%s</span>' % description

    with tempfile.NamedTemporaryFile(mode='w+', suffix='.png') as tmp:
        cmd = ['convert',
               '-size', '640x480',  # size of coach footage. configurable?
               '-background', 'black',
               'pango:%s' % markup,
               tmp.name,
               ]
        if not _run_command(cmd):
            # Without an image there is nothing for ffmpeg to encode;
            # _run_command has already reported the failure.
            return

        cmd = ['ffmpeg',
               '-f', 'image2',
               '-loop', '1',
               '-r:v', '7',
               '-i', tmp.name,
               '-pix_fmt', 'yuv420p',
               '-an',
               '-t', '10',
               outpath,
               ]
        _run_command(cmd)
265
def slice_play(footage_play_dir, full_footage_file, gobj, play,
               max_duration=0, cut_scoreboard=True):
    """
    This is just like slice, but it only slices the play provided.
    In typical cases, slice should be used since it makes sure not
    to duplicate work.

    This function will not check if the play-by-play directory for
    gobj has been created.

    max_duration is used to cap the length of a play (in seconds; 0 means
    no cap). This drastically cuts down on the time required to slice a
    game and the storage requirements of a game at the cost of potentially
    missing bigger plays. This is particularly useful if you are slicing
    broadcast footage, where imposing a cap at about 15 seconds can
    decrease storage and CPU requirements by more than half without
    missing much.

    When cut_scoreboard is True, the first 3.0 seconds of the play will be
    clipped to remove the scoreboard view. Plays that are not longer than
    3.0 seconds are left unclipped, since clipping would leave nothing.
    """
    scoreboard_seconds = 3.0

    outdir = _play_path(footage_play_dir, gobj)
    outpath = path.join(outdir, '%s.mp4' % play.idstr())

    st = play.start
    et = play.end
    if et is None:  # Probably the last play of the game.
        et = st.add_seconds(40)
    # Compare exact (fractional) lengths so that the cap does not depend on
    # how the end points happen to round.
    if max_duration > 0 and (et.fractional() - st.fractional()) > max_duration:
        et = st.add_seconds(max_duration)

    # Only clip the scoreboard when something remains afterwards; otherwise
    # the duration handed to ffmpeg would be zero or negative.
    if cut_scoreboard and (et.fractional() - st.fractional()) > scoreboard_seconds:
        st = st.add_seconds(scoreboard_seconds)

    dr = PlayTime(seconds=et.fractional() - st.fractional())

    # Milliseconds must be zero padded to three digits: '.50' would be read
    # as half a second rather than 50 milliseconds.
    start_time = '%02d:%02d:%02d.%03d' % (st.hh, st.mm, st.ss, st.milli)
    duration = '%02d:%02d:%02d.%03d' % (dr.hh, dr.mm, dr.ss, dr.milli)
    cmd = ['ffmpeg',
           '-ss', start_time,
           '-t', duration,
           '-i', full_footage_file,
           '-acodec', 'copy',
           '-vcodec', 'copy',
           outpath,
           ]
    _run_command(cmd)
312 313
def download_broadcast(footage_dir, gobj, quality='1600', dry_run=False):
    """
    Runs ffmpeg to download the full broadcast of the game given at the
    quality requested. The qualities available are:
    400, 800, 1200, 1600, 2400, 3000, 4500 with 4500 being the best.

    The footage is saved to::

        footage_dir/{eid}-{gamekey}.mp4

    A LookupError is raised when readable footage already exists at that
    path. When the stream URL does not answer a HEAD request with status
    200, a message is printed and nothing is downloaded.

    If dry_run is True, ffmpeg is told to stop after 30 seconds of
    footage.

    A full game's worth of footage at a quality of 1600 is about 2GB.
    """
    target = _full_path(footage_dir, gobj)
    if os.access(target, os.R_OK):
        raise LookupError('Footage path "%s" already exists.' % target)

    url = broadcast_url(gobj, quality)

    # Probe the URL ourselves first. A missing stream is a common failure
    # and deserves a friendlier message than raw ffmpeg output.
    response, _ = httplib2.Http().request(url, 'HEAD')
    status = response['status']
    if status != '200':
        _eprint('BAD URL (http status %s) for game %s: %s'
                % (status, _nice_game(gobj), url))
        _eprint('FAILED to download game %s' % _nice_game(gobj))
        return

    cmd = ['ffmpeg', '-timeout', '60', '-i', url]
    if dry_run:
        cmd.extend(['-t', '30'])
    cmd.extend([
        '-absf', 'aac_adtstoasc',  # no idea. ffmpeg says it is needed.
        '-acodec', 'copy',
        '-vcodec', 'copy',
        target,
    ])

    _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
    if _run_command(cmd):
        _eprint('DONE with game %s %s' % (gobj.eid, _nice_game(gobj)))
    else:
        _eprint('FAILED to download game %s' % _nice_game(gobj))
360 361
def download_coach(footage_dir, gobj, dry_run=False):
    """
    Runs rtmpdump to download the full coach footage of the game given.
    Currently, the only quality available is 1600.

    The footage is saved to::

        footage_dir/{eid}-{gamekey}.mp4

    A LookupError is raised when readable footage already exists at that
    path.

    If dry_run is True, rtmpdump is told to stop after 30 seconds of
    footage.

    A full game's worth of footage at a quality of 1600 is about 1GB.
    """
    target = _full_path(footage_dir, gobj)
    if os.access(target, os.R_OK):
        raise LookupError('Footage path "%s" already exists.' % target)

    server, app, playpath = coach_url(gobj)

    cmd = ['rtmpdump',
           '--rtmp', server,
           '--app', app,
           '--playpath', playpath,
           '--timeout', '60',
           ]
    if dry_run:
        cmd.extend(['--stop', '30'])
    cmd.extend(['-o', target])

    _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
    status = _run_command(cmd)
    if status is None:
        # _run_command reports None for rtmpdump's exit code 2.
        _eprint('DONE (incomplete) with game %s %s'
                % (gobj.eid, _nice_game(gobj)))
    elif status:
        _eprint('DONE with game %s %s' % (gobj.eid, _nice_game(gobj)))
    else:
        _eprint('FAILED to download game %s' % _nice_game(gobj))
400 401
def _run_command(cmd):
    """
    Runs the command given (a list of program name and arguments) and
    waits for it to finish. Standard output and standard error of the
    command are captured together and only shown when the command fails.

    Returns True when the command exits with status 0, and False when it
    cannot be started or exits with any other status (including being
    killed by a signal, which shows up as a negative status). In both
    failure cases a message is printed with _eprint.

    As a special case, None is returned without any message when
    rtmpdump exits with status 2.
    """
    try:
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        output = p.communicate()[0].strip()
    except OSError as e:
        _eprint("Could not run '%s' (errno: %d): %s"
                % (' '.join(cmd), e.errno, e.strerror))
        return False

    if p.returncode == 0:
        return True

    # A hack for rtmpdump...
    if p.returncode == 2 and cmd[0] == 'rtmpdump':
        return None

    # Captured output is bytes on interpreters where str is text.
    if not isinstance(output, str):
        output = output.decode('utf-8', 'replace')
    indented = '\n'.join(' %s' % line for line in output.split('\n'))
    _eprint("Could not run '%s' (exit code %d):\n%s"
            % (' '.join(cmd), p.returncode, indented))
    return False
426 427
def plays(gobj, coach=True):
    """
    Returns an ordered dictionary of all plays for a particular game
    with timings for the coach footage. If coach is False, then the
    timings will be for the broadcast footage.

    The game must be a nflgame.game.Game object.

    If there is a problem retrieving the data, None is returned.

    Results are kept in a per-footage-type cache, which is consulted only
    for games that are over. If the game is over, the raw XML data is also
    saved to disk; failing to do so only produces a message.
    """
    # Coach and broadcast timings differ, so each has its own cache.
    cache = __coach_cache if coach else __broadcast_cache

    if gobj.game_over() and gobj.eid in cache:
        return cache[gobj.eid]

    rawxml = _get_xml_data(gobj.eid, gobj.gamekey)
    ps = _xml_plays(rawxml, coach)
    if ps is None:
        return None
    if len(ps) == 0:
        _eprint('Could not find timing nodes in XML data, '
                'which provide the start time of each play.')
        return None
    cache[gobj.eid] = ps

    # Save the XML data to disk if the game is over.
    fp = _xmlf % (gobj.eid, gobj.gamekey)
    if gobj.game_over() and not os.access(fp, os.R_OK):
        try:
            out = gzip.open(fp, 'wb')
            try:
                out.write(rawxml)
            finally:
                # Closing flushes the gzip trailer; without it the file on
                # disk may be truncated.
                out.close()
        except IOError:
            _eprint('Could not cache XML data. Please make '
                    '"%s" writable.' % path.dirname(fp))
    return ps
467 468
def play(gobj, playid, coach=True):
    """
    Returns a Play object given a game and a play id with timings for
    the coach footage. If coach is False, then the timings will be for
    the broadcast footage.

    The game must be a nflgame.game.Game object.

    If a play with the given id does not exist, or if the play-by-play
    data for the game could not be retrieved, None is returned.
    """
    ps = plays(gobj, coach)
    if ps is None:
        return None
    return ps.get(playid, None)
480 481
class Play (object):
    """
    A single play together with the meta data that ties it to game
    footage. The start corresponds to the 'ArchiveTCIN' or 'CATIN' node,
    which is when the play starts. Since there is no record of when a play
    stops, the end is the start time of the next play. For the last play
    recorded, the end is None.

    The play id is the foreign key that maps to play data stored in nflgame.
    """
    def __init__(self, start, end, playid):
        self.start = start
        self.end = end
        self.playid = playid

    def idstr(self):
        """Returns the play id as a string, zero padded to 4 digits."""
        return str(int(self.playid)).zfill(4)

    def __str__(self):
        fields = (self.playid, self.start, self.end)
        return '(%s, %s, %s)' % fields
501 502
class PlayTime (object):
    """
    Represents a footage time point, in the format HH:MM:SS:MMM where
    MMM can be either 2 or 3 digits.

    The parsed components are available as the integer attributes hh, mm,
    ss and milli, where milli is always expressed in milliseconds.
    PlayTime objects compare (and hash) by their position in time.
    """
    def __init__(self, point=None, seconds=None):
        """
        Construct a PlayTime object given a point in time in the format
        HH:MM:SS:MMM where MMM can be either 2 or 3 digits.

        Alternatively, seconds can be provided (which may be a float), in
        which case point is ignored. The value is rounded to the nearest
        millisecond.

        An AssertionError is raised if point is not made of exactly four
        integer parts.
        """
        if seconds is not None:
            # Work in whole milliseconds. Truncating the fractional part of
            # a float loses a millisecond for values like 2.3.
            total_milli = int(round(seconds * 1000.0))
            total_secs, milli = divmod(total_milli, 1000)
            hh, rest = divmod(total_secs, 3600)
            mm, ss = divmod(rest, 60)

            self.hh, self.mm, self.ss, self.milli = hh, mm, ss, milli
            self.__point = '%02d:%02d:%02d:%03d' % (hh, mm, ss, milli)
            self.__coach = True
            return

        self.__point = point

        parts = point.split(':')
        if len(parts) != 4:
            raise AssertionError('Expected 4 parts but got %d in: %s'
                                 % (len(parts), point))

        # A three digit final part is how coach timings are recognised.
        self.__coach = len(parts[3]) == 3
        try:
            self.hh, self.mm, self.ss, self.milli = [int(p) for p in parts]
        except ValueError:
            raise AssertionError('Bad play time format: %s' % point)

        # I believe milliseconds is given in tens of milliseconds
        # for the ArchiveTCIN node. But the CATIN node (coach timing)
        # provides regular milliseconds.
        if not self.__coach:
            self.milli *= 10

    def add_seconds(self, seconds):
        """
        Returns a new PlayTime with seconds (int or float) added to self.
        """
        return PlayTime(seconds=self.fractional() + seconds)

    def seconds(self):
        """
        Returns this time point rounded to the nearest second.
        """
        secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
        # milli is in milliseconds, so the halfway point is 500.
        if self.milli >= 500:
            secs += 1
        return secs

    def fractional(self):
        """
        Returns this time point as fractional seconds based on milliseconds.
        """
        secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
        secs = (1000 * secs) + self.milli
        return float(secs) / 1000.0

    def __cmp__(self, other):
        mine, theirs = self.fractional(), other.fractional()
        return (mine > theirs) - (mine < theirs)

    # Rich comparisons give the same ordering as __cmp__ on interpreters
    # that no longer consult __cmp__.
    def __eq__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() == other.fractional()

    def __ne__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() != other.fractional()

    def __lt__(self, other):
        return self.fractional() < other.fractional()

    def __le__(self, other):
        return self.fractional() <= other.fractional()

    def __gt__(self, other):
        return self.fractional() > other.fractional()

    def __ge__(self, other):
        return self.fractional() >= other.fractional()

    def __hash__(self):
        return hash(self.fractional())

    def __sub__(self, other):
        """
        Returns the difference rounded to nearest second between
        two time points. The 'other' time point must take place before the
        current time point; otherwise an AssertionError is raised.
        """
        if other.fractional() > self.fractional():
            raise AssertionError('%s is not <= than %s' % (other, self))
        return int(round(self.fractional() - other.fractional()))

    def __str__(self):
        return self.__point
591 592
def _xml_plays(data, coach=True):
    """
    Turns raw play-by-play XML into an ordered dictionary of Play objects,
    keyed by play id, using the coach play timings. If coach is set to
    False, then the broadcast timings are used instead.

    None is returned when data is None.
    """
    if data is None:
        return None

    timing_tag = 'catin' if coach else 'archivetcin'

    # Collect every usable row first: the end of a play is the start of the
    # row that follows it, so we need to be able to look ahead.
    timed = []
    for node in bs4.BeautifulSoup(data).find_all('row'):
        id_node = node.find('id')
        if id_node:
            pid = id_node.get_text().strip()
        else:
            # No <id> child; fall back to a 'playid' attribute.
            pid = node.get('playid', None)
            if not pid:
                continue
            pid = pid.strip()

        start_node = node.find(timing_tag)
        if not start_node:
            continue
        began = PlayTime(start_node.get_text().strip())

        # Skip rows that start before the previously accepted row.
        if timed and began < timed[-1][1]:
            continue
        timed.append((pid, began, node))

    # Decides whether a row is left out of the final result. For example,
    # timeouts take a lot of time but aren't needed for play-by-play
    # footage.
    def skipped(node):
        attrs = node.attrs
        if 'playdescription' in attrs:
            description = node['playdescription'].lower()
            if description.startswith(('timeout', 'two-minute')):
                return True

        # Did we miss anything?
        if 'preplaybyplay' in attrs:
            if node['preplaybyplay'].lower().startswith('timeout'):
                return True
        return False

    result = OrderedDict()
    last = len(timed) - 1
    for idx, (pid, began, node) in enumerate(timed):
        if skipped(node):
            continue
        finish = timed[idx + 1][1] if idx < last else None
        result[pid] = Play(began, finish, pid)
    return result
def _get_xml_data(eid=None, gamekey=None, fpath=None):
    """
    Returns the XML play data corresponding to the game given. A game must
    be specified in one of two ways: by providing the eid and gamekey or
    by providing the file path to a gzipped XML file.

    If the XML data is already on disk, it is read, decompressed and returned.

    Otherwise, the XML data is downloaded from the NFL web site. If the data
    doesn't exist yet or there was a network error (HTTP error, unreachable
    host, timeout), the error is printed and _get_xml_data returns None.
    """
    assert (eid is not None and gamekey is not None) or fpath is not None

    def read_gzipped(fp):
        # Close the handle explicitly rather than leaving it to the
        # garbage collector.
        handle = gzip.open(fp)
        try:
            return handle.read()
        finally:
            handle.close()

    if fpath is not None:
        return read_gzipped(fpath)

    fpath = _xmlf % (eid, gamekey)
    if os.access(fpath, os.R_OK):
        return read_gzipped(fpath)

    # Games played in January through March belong to the season that
    # started in the previous calendar year.
    year = int(eid[0:4])
    month = int(eid[4:6])
    if month <= 3:
        year -= 1
    u = _xml_base_url % (year, gamekey)  # The year and the game key.
    try:
        response = urllib2.urlopen(u, timeout=10)
        try:
            return response.read()
        finally:
            response.close()
    except (urllib2.URLError, socket.error) as e:
        # URLError also covers HTTPError; socket.error covers timeouts.
        _eprint(e)
    return None
688