1 """
2 Introduction
3 ============
4 A simple library to download, slice and search NFL game footage on a
5 play-by-play basis.
6
7 This library comes with preloaded play-by-play meta data, which describes the
8 start time of each play in the game footage. However, the actual footage does
9 not come with this library and is not released by me. This package therefore
10 provides utilities to batch download NFL Game Footage from the original source.
11
12 Once game footage is downloaded, you can use this library to search plays and
13 construct a playlist to play in any video player.
14 """
15
16 import gzip
17 import math
18 import os
19 import os.path as path
20 import socket
21 import sys
22 import tempfile
23 import urllib2
24
25 import bs4
26
27 import eventlet
28 httplib2 = eventlet.import_patched('httplib2')
29 import eventlet.green.subprocess as subprocess
30
31 from nflgame import OrderedDict
32
# Path template for the bundled, gzipped play-by-play XML files. It is
# filled in with (eid, gamekey).
_xmlf = path.join(path.split(__file__)[0], 'pbp-xml', '%s-%s.xml.gz')

# URL template for the remote play-by-play XML. It is filled in with
# (season year, gamekey).
_xml_base_url = 'http://e2.cdnl3.neulion.com/nfl/edl/nflgr/%d/%s.xml'

# Coach footage location as a triple: (rtmp server, rtmp app, rtmp playpath).
# The playpath is filled in with (season, gamekey). A single-string form of
# this URL used to be assigned first and was immediately overwritten by this
# tuple, so only the tuple is kept.
_coach_url = (
    'rtmp://neulionms.fcod.llnwd.net',
    'a5306/e1',
    'mp4:u/nfl/nfl/coachtapes/%s/%s_all_1600',
)

# URL template for the broadcast HTTP Live Stream playlist. It is filled in
# with (season, month, day, gamekey, gamekey, away, home, season, quality).
_broadcast_url = 'http://nlds82.cdnl3nl.neulion.com/nlds_vod/nfl/vod/' \
    '%s/%s/%s/%s/2_%s_%s_%s_%s_h_whole_1_%s.mp4.m3u8'

# In-memory caches of parsed plays keyed by game eid, one per timing source.
__broadcast_cache = {}
__coach_cache = {}
47
48
50 print >> sys.stderr, s
51
52
54 """
55 Returns the HTTP Live Stream URL (an m3u8 file) for the given game
56 and quality.
57
58 Note that this does not work with every game (yet). In particular,
59 URLs vary unpredictably (to me) from game to game.
60 """
61 month, day = gobj.eid[4:6], gobj.eid[6:8]
62 return _broadcast_url \
63 % (gobj.season(), month, day, gobj.gamekey, gobj.gamekey,
64 gobj.away.lower(), gobj.home.lower(), gobj.season(), quality)
65
66
68 """
69 Returns the rtmp URL as a triple for the coach footage
    of the given game. The elements of the triple are::
71
72 (rtmp server, rtmp app name, rtmp playpath)
73
74 Coach video only comes in 1600 quality.
75 """
76 return (
77 _coach_url[0],
78 _coach_url[1],
79 _coach_url[2] % (gobj.season(), gobj.gamekey),
80 )
81
82
94
95
108
109
122
123
125 return path.join(footage_dir, '%s-%s.mp4' % (g.eid, g.gamekey))
126
127
129 return path.join(footage_play_dir, '%s-%s' % (g.eid, g.gamekey))
130
131
133 return '(Season: %s, Week: %s, %s)' \
134 % (gobj.schedule['year'], gobj.schedule['week'], gobj)
135
136
def unsliced_plays(footage_play_dir, gobj, coach=True, dry_run=False):
    """
    Lists the plays of a game that do not have a sliced video file yet.

    A play counts as sliced only when the following file is readable,
    where {playid} is its zero-padded play id::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    Every play of the game that fails this test is returned in a list.
    An empty list therefore means that all plays have been sliced. None
    is returned when the play-by-play meta data could not be retrieved.

    If coach is False, then play timings for broadcast footage are used
    instead of coach timings.

    If dry_run is True, then only the first 10 plays of the game are
    examined.
    """
    all_plays = plays(gobj, coach)
    game_dir = _play_path(footage_play_dir, gobj)
    if all_plays is None:
        return None

    candidates = list(all_plays.values())
    if dry_run:
        candidates = candidates[:10]

    def is_sliced(p):
        # A play is done once its video file can be read from the game dir.
        return os.access(path.join(game_dir, '%s.mp4' % p.idstr()), os.R_OK)

    return [p for p in candidates if not is_sliced(p)]
172
173
def slice(footage_play_dir, full_footage_file, gobj, coach=True,
          threads=4, dry_run=False):
    """
    Uses ffmpeg to slice the given footage file into play-by-play pieces.
    The full_footage_file should point to a full game downloaded with
    nflvid-footage and gobj should be the corresponding nflgame.game.Game
    object.

    The footage_play_dir is where the pieces will be saved::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    This function will not duplicate work. If a video file exists for
    a particular play, then slice will not regenerate it.

    Note that this function uses an eventlet green pool to run multiple
    ffmpeg instances simultaneously. The maximum number of threads to
    use is specified by threads. This function only returns once all
    of them have finished processing.

    If coach is False, then play timings for broadcast footage will be
    used instead of coach timings.

    If dry_run is True, then only the first 10 plays of the game are
    considered for slicing.

    Nothing is returned. Progress and problems are reported on stderr.
    """
    outdir = _play_path(footage_play_dir, gobj)
    if not os.access(outdir, os.R_OK):
        os.makedirs(outdir)

    unsliced = unsliced_plays(footage_play_dir, gobj, coach, dry_run)
    if not unsliced:
        # Covers both None (meta data unavailable) and an empty list.
        # Stay quiet when footage_plays reports something for this game
        # (presumably plays already sliced to disk; it is defined outside
        # this view).
        if not footage_plays(footage_play_dir, gobj):
            # Identify the game by eid, like the DONE message below.
            # _nice_game already embeds the game itself.
            _eprint(
                'There are no unsliced plays remaining for game %s %s.\n'
                'If they have not been sliced yet, then the XML play-by-play '
                'meta data may not be available or is corrupt.'
                % (gobj.eid, _nice_game(gobj)))
        return

    pool = eventlet.greenpool.GreenPool(threads)
    for p in unsliced:
        # Trailing arguments are max_duration=0 (no cap) and
        # cut_scoreboard=True.
        pool.spawn_n(slice_play, footage_play_dir, full_footage_file, gobj, p,
                     0, True)
    pool.waitall()

    _eprint('DONE slicing game %s %s' % (gobj.eid, _nice_game(gobj)))
223
224
226 """
227 Creates a video file that contains a single static image with a
228 textual description of the play. The purpose is to provide some
229 representation of a play even if its video form doesn't exist.
230 (Or more likely, the play-by-play meta data for that play is
231 corrupt.)
232
233 This function requires the use of ImageMagick's convert with
234 pango support.
235
236 Note that gobj_play is an nflgame.game.Play object and not an
237 nflvid.Play object.
238 """
239 outdir = _play_path(footage_play_dir, gobj)
240 outpath = path.join(outdir, '%04d.mp4' % int(gobj_play.playid))
241
242
243
244 pango = '<span size="20000" foreground="white">'
245 with tempfile.NamedTemporaryFile(mode='w+', suffix='.png') as tmp:
246 cmd = ['convert',
247 '-size', '640x480',
248 '-background', 'black',
249 'pango:%s%s</span>' % (pango, gobj_play),
250 tmp.name,
251 ]
252 _run_command(cmd)
253
254 cmd = ['ffmpeg',
255 '-f', 'image2',
256 '-loop', '1',
257 '-r:v', '7',
258 '-i', tmp.name,
259 '-pix_fmt', 'yuv420p',
260 '-an',
261 '-t', '10',
262 outpath,
263 ]
264 _run_command(cmd)
265
def slice_play(footage_play_dir, full_footage_file, gobj, play,
               max_duration=0, cut_scoreboard=True):
    """
    This is just like slice, but it only slices the play provided.
    In typical cases, slice should be used since it makes sure not
    to duplicate work.

    This function will not check if the play-by-play directory for
    gobj has been created.

    max_duration is used to cap the length of a play. This drastically
    cuts down on the time required to slice a game and the storage
    requirements of a game at the cost of potentially missing bigger
    plays. This is particularly useful if you are slicing broadcast
    footage, where imposing a cap at about 15 seconds can decrease
    storage and CPU requirements by more than half without missing much.
    A value of 0 (the default) means no cap.

    When cut_scoreboard is True, the first 3.0 seconds of
    the play will be clipped to remove the scoreboard view.

    A play without an end time is given a length of 40 seconds.
    """
    outdir = _play_path(footage_play_dir, gobj)
    st = play.start
    outpath = path.join(outdir, '%s.mp4' % play.idstr())

    et = play.end
    if et is None:
        # No following play to mark the end, so fall back to a fixed length.
        et = st.add_seconds(40)
    if max_duration > 0 and (et.seconds() - st.seconds()) > max_duration:
        et = st.add_seconds(max_duration)

    if cut_scoreboard:
        # NOTE(review): if the play is shorter than this 3.0 second clip,
        # st lands past et and the duration below goes negative. That case
        # is not handled here.
        st = st.add_seconds(3.0)

    dr = PlayTime(seconds=et.fractional() - st.fractional())

    # The fraction must be zero padded to three digits: milli is a count of
    # milliseconds, so 50 has to be written as ".050". A bare "%d" would
    # emit ".50", which is half a second.
    start_time = '%02d:%02d:%02d.%03d' % (st.hh, st.mm, st.ss, st.milli)
    duration = '%02d:%02d:%02d.%03d' % (dr.hh, dr.mm, dr.ss, dr.milli)
    cmd = ['ffmpeg',
           '-ss', start_time,
           '-t', duration,
           '-i', full_footage_file,
           '-acodec', 'copy',
           '-vcodec', 'copy',
           outpath,
           ]
    _run_command(cmd)
312
313
315 """
316 Starts an ffmpeg process to download the full broadcast of the given
317 game with the quality provided. The qualities available are:
318 400, 800, 1200, 1600, 2400, 3000, 4500 with 4500 being the best.
319
320 The footage will be saved to the following path::
321
322 footage_dir/{eid}-{gamekey}.mp4
323
324 If footage is already at that path, then a LookupError is raised.
325
326 A full game's worth of footage at a quality of 1600 is about 2GB.
327 """
328 fp = _full_path(footage_dir, gobj)
329 if os.access(fp, os.R_OK):
330 raise LookupError('Footage path "%s" already exists.' % fp)
331
332 url = broadcast_url(gobj, quality)
333
334
335
336
337 resp, _ = httplib2.Http().request(url, 'HEAD')
338 if resp['status'] != '200':
339 _eprint('BAD URL (http status %s) for game %s: %s'
340 % (resp['status'], _nice_game(gobj), url))
341 _eprint('FAILED to download game %s' % _nice_game(gobj))
342 return
343
344 cmd = ['ffmpeg',
345 '-timeout', '60',
346 '-i', url]
347 if dry_run:
348 cmd += ['-t', '30']
349 cmd += ['-absf', 'aac_adtstoasc',
350 '-acodec', 'copy',
351 '-vcodec', 'copy',
352 fp,
353 ]
354
355 _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
356 if not _run_command(cmd):
357 _eprint('FAILED to download game %s' % _nice_game(gobj))
358 else:
359 _eprint('DONE with game %s %s' % (gobj.eid, _nice_game(gobj)))
360
361
363 """
364 Starts an rtmpdump process to download the full coach footage of the
365 given game. Currently, the only quality available is 1600.
366
367 The footage will be saved to the following path::
368
369 footage_dir/{eid}-{gamekey}.mp4
370
371 If footage is already at that path, then a LookupError is raised.
372
373 A full game's worth of footage at a quality of 1600 is about 1GB.
374 """
375 fp = _full_path(footage_dir, gobj)
376 if os.access(fp, os.R_OK):
377 raise LookupError('Footage path "%s" already exists.' % fp)
378
379 server, app, path = coach_url(gobj)
380
381 cmd = ['rtmpdump',
382 '--rtmp', server,
383 '--app', app,
384 '--playpath', path,
385 '--timeout', '60',
386 ]
387 if dry_run:
388 cmd += ['--stop', '30']
389 cmd += ['-o', fp]
390
391 _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
392 status = _run_command(cmd)
393 if status is None:
394 _eprint('DONE (incomplete) with game %s %s'
395 % (gobj.eid, _nice_game(gobj)))
396 elif not status:
397 _eprint('FAILED to download game %s' % _nice_game(gobj))
398 else:
399 _eprint('DONE with game %s %s' % (gobj.eid, _nice_game(gobj)))
400
401
403 try:
404 p = subprocess.Popen(cmd,
405 stdout=subprocess.PIPE,
406 stderr=subprocess.STDOUT)
407 output = p.communicate()[0].strip()
408
409 if p.returncode > 0:
410 err = subprocess.CalledProcessError(p.returncode, cmd)
411 err.output = output
412 raise err
413 except subprocess.CalledProcessError, e:
414
415 if e.returncode == 2 and cmd[0] == 'rtmpdump':
416 return None
417 indent = lambda s: '\n'.join(map(lambda l: ' %s' % l, s.split('\n')))
418 _eprint("Could not run '%s' (exit code %d):\n%s"
419 % (' '.join(cmd), e.returncode, indent(e.output)))
420 return False
421 except OSError, e:
422 _eprint("Could not run '%s' (errno: %d): %s"
423 % (' '.join(cmd), e.errno, e.strerror))
424 return False
425 return True
426
427
def plays(gobj, coach=True):
    """
    Returns an ordered dictionary of all plays for a particular game
    with timings for the coach footage. If coach is False, then the
    timings will be for the broadcast footage.

    The game must be a nflgame.game.Game object.

    If there is a problem retrieving the data, None is returned.

    Parsed plays are kept in a per-source in-memory cache keyed by eid.
    The cache is only consulted once the game is over.

    If the game is over, then the XML data is saved to disk.
    """
    cache = __coach_cache if coach else __broadcast_cache

    if gobj.game_over() and gobj.eid in cache:
        return cache[gobj.eid]

    rawxml = _get_xml_data(gobj.eid, gobj.gamekey)
    ps = _xml_plays(rawxml, coach)
    if ps is None:
        return None
    if len(ps) == 0:
        _eprint('Could not find timing nodes in XML data, '
                'which provide the start time of each play.')
        return None

    # Store into the cache that matches the timing source. Writing to the
    # broadcast cache unconditionally would hand coach timings to a later
    # broadcast lookup and leave the coach cache empty.
    cache[gobj.eid] = ps

    # Save the XML for finished games so it need not be fetched again.
    fp = _xmlf % (gobj.eid, gobj.gamekey)
    if gobj.game_over() and not os.access(fp, os.R_OK):
        try:
            out = gzip.open(fp, 'wb')
            try:
                out.write(rawxml)
            finally:
                # Close explicitly so the gzip trailer is flushed right away
                # instead of whenever the object is garbage collected.
                out.close()
        except IOError:
            _eprint('Could not cache XML data. Please make '
                    '"%s" writable.' % path.dirname(fp))
    return ps
467
468
def play(gobj, playid, coach=True):
    """
    Returns a Play object given a game and a play id with timings for
    the coach footage. If coach is False, then the timings will be for
    the broadcast footage.

    The game must be a nflgame.game.Game object.

    If a play with the given id does not exist, or if the play-by-play
    data for the game could not be retrieved, None is returned.
    """
    # Forward coach so that broadcast timings can actually be requested.
    ps = plays(gobj, coach)
    if ps is None:
        # plays() signals a retrieval problem with None.
        return None
    return ps.get(playid, None)
480
481
482 -class Play (object):
483 """
484 Represents a single play with meta data that ties it to game footage.
485 The footage_start corresponds to the 'ArchiveTCIN' or 'CATIN', which
486 is when the play starts. Since there is no record of when a play
487 stops, the end is computed by using the start time of the next play.
488 If it's the last play recorded, then the end time is None.
489
490 The play id is the foreign key that maps to play data stored in nflgame.
491 """
492 - def __init__(self, start, end, playid):
493 self.start, self.end, self.playid = start, end, playid
494
496 """Returns a string play id padded with zeroes."""
497 return '%04d' % int(self.playid)
498
500 return '(%s, %s, %s)' % (self.playid, self.start, self.end)
501
502
504 """
505 Represents a footage time point, in the format HH:MM:SS:MMM where
506 MMM can be either 2 or 3 digits.
507 """
508 - def __init__(self, point=None, seconds=None):
509 """
510 Construct a PlayTime object given a point in time in the format
511 HH:MM:SS:MMM where MMM can be either 2 or 3 digits.
512
513 Alternatively, seconds can be provided (which may be a float).
514 """
515 if seconds is not None:
516 milli = int(1000 * (seconds - math.floor(seconds)))
517
518 seconds = int(math.floor(seconds))
519 hh = seconds / 3600
520
521 seconds -= hh * 3600
522 mm = seconds / 60
523
524 seconds -= mm * 60
525 ss = seconds
526
527 self.hh, self.mm, self.ss, self.milli = hh, mm, ss, milli
528 self.__point = '%02d:%02d:%02d:%03d' % (hh, mm, ss, milli)
529 return
530
531 self.__point = point
532 self.__coach = False
533
534 try:
535 parts = self.__point.split(':')
536 if len(parts[3]) == 3:
537 self.__coach = True
538 parts = map(int, parts)
539 except ValueError:
540 assert False, 'Bad play time format: %s' % self.__point
541
542 if len(parts) != 4:
543 assert False, 'Expected 4 parts but got %d in: %s' \
544 % (len(parts), self.__point)
545
546 self.hh, self.mm, self.ss, self.milli = parts
547
548
549
550
551 if not self.__coach:
552 self.milli *= 10
553
559
561 """
562 Returns this time point rounded to the nearest second.
563 """
564 secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
565 if self.milli >= 50:
566 secs += 1
567 return secs
568
570 """
571 Returns this time point as fractional seconds based on milliseconds.
572 """
573 secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
574 secs = (1000 * secs) + self.milli
575 return float(secs) / 1000.0
576
579
581 """
582 Returns the difference rounded to nearest second between
583 two time points. The 'other' time point must take place before the
584 current time point.
585 """
586 assert other <= self, '%s is not <= than %s' % (other, self)
587 return int(round(self.fractional() - other.fractional()))
588
591
592
594 """
595 Parses the XML raw data given into an ordered dictionary of Play
596 objects corresponding to coach play timings. If coach is set to
597 False, then play timings for the broadcast are retrieved.
598
599 The dictionary is keyed by play id.
600 """
601 if data is None:
602 return None
603
604
605
606 rows = []
607 for row in bs4.BeautifulSoup(data).find_all('row'):
608 playid = row.find('id')
609 if not playid:
610 playid = row.get('playid', None)
611 if not playid:
612 continue
613 playid = playid.strip()
614 else:
615 playid = playid.get_text().strip()
616
617 if coach:
618 start = row.find('catin')
619 else:
620 start = row.find('archivetcin')
621 if not start:
622 continue
623 start = PlayTime(start.get_text().strip())
624
625
626 if len(rows) > 0 and start < rows[-1][1]:
627 continue
628 rows.append((playid, start, row))
629
630
631
632
633 def ignore(row):
634 if 'playdescription' in row.attrs:
635 if row['playdescription'].lower().startswith('timeout'):
636 return True
637 if row['playdescription'].lower().startswith('two-minute'):
638 return True
639
640
641 if 'preplaybyplay' in row.attrs:
642 if row['preplaybyplay'].lower().startswith('timeout'):
643 return True
644 return False
645
646 d = OrderedDict()
647 for i, (playid, start, row) in enumerate(rows):
648 if ignore(row):
649 continue
650 end = None
651 if i < len(rows) - 1:
652 end = rows[i+1][1]
653 d[playid] = Play(start, end, playid)
654 return d
655
656
658 """
659 Returns the XML play data corresponding to the game given. A game must
660 be specified in one of two ways: by providing the eid and gamekey or
661 by providing the file path to a gzipped XML file.
662
663 If the XML data is already on disk, it is read, decompressed and returned.
664
665 Otherwise, the XML data is downloaded from the NFL web site. If the data
666 doesn't exist yet or there was an error, _get_xml_data returns None.
667 """
668 assert (eid is not None and gamekey is not None) or fpath is not None
669
670 if fpath is not None:
671 return gzip.open(fpath).read()
672
673 fpath = _xmlf % (eid, gamekey)
674 if os.access(fpath, os.R_OK):
675 return gzip.open(fpath).read()
676 try:
677 year = int(eid[0:4])
678 month = int(eid[4:6])
679 if month <= 3:
680 year -= 1
681 u = _xml_base_url % (year, gamekey)
682 return urllib2.urlopen(u, timeout=10).read()
683 except urllib2.HTTPError, e:
684 _eprint(e)
685 except socket.timeout, e:
686 _eprint(e)
687 return None
688