1 """
2 Introduction
3 ============
4 A simple library to download, slice and search NFL game footage on a
5 play-by-play basis.
6
7 This library comes with preloaded play-by-play meta data, which describes the
8 start time of each play in the game footage. However, the actual footage does
9 not come with this library and is not released by me. This package therefore
10 provides utilities to batch download NFL Game Footage from the original source.
11
12 Once game footage is downloaded, you can use this library to search plays and
13 construct a playlist to play in any video player.
14 """
15
16 import gzip
17 import math
18 import os
19 import os.path as path
20 import socket
21 import sys
22 import tempfile
23 import urllib2
24
25 import bs4
26
27 import eventlet
28 httplib2 = eventlet.import_patched('httplib2')
29 import eventlet.green.subprocess as subprocess
30
31 from nflgame import OrderedDict
32
# Template for the gzipped play-by-play XML shipped next to this module.
# It is filled with (eid, gamekey).
_xmlf = path.join(path.split(__file__)[0], 'pbp-xml', '%s-%s.xml.gz')

# Remote location of the play-by-play XML; filled with (year, gamekey).
_xml_base_url = 'http://e2.cdnl3.neulion.com/nfl/edl/nflgr/%d/%s.xml'

# Coach footage location as a triple:
#     (rtmp server, rtmp app name, rtmp playpath)
# The playpath is filled with (season, gamekey). An earlier single-string
# form of this URL was assigned and then immediately overwritten by this
# tuple, so only the tuple is kept.
_coach_url = (
    'rtmp://neulionms.fcod.llnwd.net',
    'a5306/e1',
    'mp4:u/nfl/nfl/coachtapes/%s/%s_all_1600',
)

# HTTP Live Stream (m3u8) template for broadcast footage. It takes nine
# values: season, month, day, gamekey, gamekey, away, home, season, quality.
_broadcast_url = 'http://nlds82.cdnl3nl.neulion.com/nlds_vod/nfl/vod/' \
                 '%s/%s/%s/%s/2_%s_%s_%s_%s_h_whole_1_%s.mp4.m3u8'

# In-memory caches of parsed plays, keyed by game eid: one per footage type.
__broadcast_cache = {}
__coach_cache = {}
47
48
50 print >> sys.stderr, s
51
52
54 """
55 Returns the HTTP Live Stream URL (an m3u8 file) for the given game
56 and quality.
57
58 Note that this does not work with every game (yet). In particular,
59 URLs vary unpredictably (to me) from game to game.
60 """
61 month, day = gobj.eid[4:6], gobj.eid[6:8]
62 return _broadcast_url \
63 % (gobj.season(), month, day, gobj.gamekey, gobj.gamekey,
64 gobj.away.lower(), gobj.home.lower(), gobj.season(), quality)
65
66
68 """
69 Returns the rtmp URL as a triple for the coach footage
70 of the given game. The elements of the triple are::
71
72 (rtmp server, rtmp app name, rtmp playpath)
73
74 Coach video only comes in 1600 quality.
75 """
76 return (
77 _coach_url[0],
78 _coach_url[1],
79 _coach_url[2] % (gobj.season(), gobj.gamekey),
80 )
81
82
94
95
108
109
122
123
125 return path.join(footage_dir, '%s-%s.mp4' % (g.eid, g.gamekey))
126
127
129 return path.join(footage_play_dir, '%s-%s' % (g.eid, g.gamekey))
130
131
133 return '(Season: %s, Week: %s, %s)' \
134 % (gobj.schedule['year'], gobj.schedule['week'], gobj)
135
136
def unsliced_plays(footage_play_dir, gobj, coach=True, dry_run=False):
    """
    Lists the plays of a game that do not have a sliced video file yet.

    A play counts as sliced only when this file is readable, where
    {playid} is the zero-padded play id::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    Every play of the given game without such a file is returned. An
    empty list therefore means that the whole game has been sliced.
    None is returned when the play-by-play meta data could not be
    retrieved.

    When coach is False, broadcast timings are used in place of coach
    timings.

    When dry_run is True, only the first 10 plays of the game are
    examined.
    """
    game_plays = plays(gobj, coach)
    game_dir = _play_path(footage_play_dir, gobj)
    if game_plays is None:
        return None

    # A dry run stops looking after the first 10 plays.
    limit = 10 if dry_run else None

    missing = []
    for seen, candidate in enumerate(game_plays.values()):
        if limit is not None and seen >= limit:
            break
        clip = path.join(game_dir, '%s.mp4' % candidate.idstr())
        if os.access(clip, os.R_OK):
            continue
        missing.append(candidate)
    return missing
172
173
def slice(footage_play_dir, full_footage_file, gobj, coach=True,
          threads=4, dry_run=False):
    """
    Cuts a full game footage file into one video per play using ffmpeg.

    full_footage_file should be a complete game downloaded with
    nflvid-footage, and gobj the matching nflgame.game.Game object.

    The pieces are written below footage_play_dir::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    Work is never repeated: a play that already has a video file is
    left alone.

    Several ffmpeg instances run at once through an eventlet green
    pool whose size is given by threads. The function returns only
    after every spawned slice has finished.

    When coach is False, broadcast timings are used in place of coach
    timings.

    When dry_run is true, only the first 10 plays of the game are
    sliced.
    """
    game_dir = _play_path(footage_play_dir, gobj)
    if not os.access(game_dir, os.R_OK):
        os.makedirs(game_dir)

    todo = unsliced_plays(footage_play_dir, gobj, coach, dry_run)
    if not todo:
        # Covers both "meta data unavailable" (None) and "nothing left" ([]).
        notice = (
            'There are no unsliced plays remaining for game %s %s.\n'
            'If they have not been sliced yet, then the XML play-by-play '
            'meta data may not be available or is corrupt.'
        )
        _eprint(notice % (gobj, _nice_game(gobj)))
        return

    workers = eventlet.greenpool.GreenPool(threads)
    for pending in todo:
        # Trailing arguments: no duration cap, scoreboard clipping enabled.
        workers.spawn_n(slice_play, footage_play_dir, full_footage_file,
                        gobj, pending, 0, True)
    workers.waitall()

    _eprint('DONE slicing game %s' % _nice_game(gobj))
220
221
223 """
224 Creates a video file that contains a single static image with a
225 textual description of the play. The purpose is to provide some
226 representation of a play even if its video form doesn't exist.
227 (Or more likely, the play-by-play meta data for that play is
228 corrupt.)
229
230 This function requires the use of ImageMagick's convert with
231 pango support.
232
233 Note that gobj_play is an nflgame.game.Play object and not an
234 nflvid.Play object.
235 """
236 outdir = _play_path(footage_play_dir, gobj)
237 outpath = path.join(outdir, '%04d.mp4' % int(gobj_play.playid))
238
239
240
241 pango = '<span size="20000" foreground="white">'
242 with tempfile.NamedTemporaryFile(mode='w+', suffix='.png') as tmp:
243 cmd = ['convert',
244 '-size', '640x480',
245 '-background', 'black',
246 'pango:%s%s</span>' % (pango, gobj_play),
247 tmp.name,
248 ]
249 _run_command(cmd)
250
251 cmd = ['ffmpeg',
252 '-f', 'image2',
253 '-loop', '1',
254 '-r:v', '7',
255 '-i', tmp.name,
256 '-pix_fmt', 'yuv420p',
257 '-an',
258 '-t', '10',
259 outpath,
260 ]
261 _run_command(cmd)
262
def slice_play(footage_play_dir, full_footage_file, gobj, play,
               max_duration=0, cut_scoreboard=True):
    """
    This is just like slice, but it only slices the play provided.
    In typical cases, slice should be used since it makes sure not
    to duplicate work.

    This function will not check if the play-by-play directory for
    gobj has been created.

    max_duration is used to cap the length of a play. This drastically
    cuts down on the time required to slice a game and the storage
    requirements of a game at the cost of potentially missing bigger
    plays. This is particularly useful if you are slicing broadcast
    footage, where imposing a cap at about 15 seconds can decrease
    storage and CPU requirements by more than half without missing much.
    A value of 0 (the default) disables the cap.

    When cut_scoreboard is True, the first 3.0 seconds of
    the play will be clipped to remove the scoreboard view.
    """
    outdir = _play_path(footage_play_dir, gobj)
    outpath = path.join(outdir, '%s.mp4' % play.idstr())

    st = play.start
    et = play.end
    if et is None:
        # No end time is recorded for this play; fall back to 40 seconds.
        et = st.add_seconds(40)
    if max_duration > 0 and (et.seconds() - st.seconds()) > max_duration:
        et = st.add_seconds(max_duration)

    if cut_scoreboard:
        st = st.add_seconds(3.0)

    dr = PlayTime(seconds=et.fractional() - st.fractional())

    # ffmpeg reads the digits after '.' as a decimal fraction of a second,
    # so milliseconds must be zero padded to three digits. Without the
    # padding, 50ms would be written as '.50' and read as half a second.
    stamp = '%02d:%02d:%02d.%03d'
    start_time = stamp % (st.hh, st.mm, st.ss, st.milli)
    duration = stamp % (dr.hh, dr.mm, dr.ss, dr.milli)
    cmd = ['ffmpeg',
           '-ss', start_time,
           '-t', duration,
           '-i', full_footage_file,
           '-acodec', 'copy',
           '-vcodec', 'copy',
           outpath,
           ]
    _run_command(cmd)
309
310
312 """
313 Starts an ffmpeg process to download the full broadcast of the given
314 game with the quality provided. The qualities available are:
315 400, 800, 1200, 1600, 2400, 3000, 4500 with 4500 being the best.
316
317 The footage will be saved to the following path::
318
319 footage_dir/{eid}-{gamekey}.mp4
320
321 If footage is already at that path, then a LookupError is raised.
322
323 A full game's worth of footage at a quality of 1600 is about 2GB.
324 """
325 fp = _full_path(footage_dir, gobj)
326 if os.access(fp, os.R_OK):
327 raise LookupError('Footage path "%s" already exists.' % fp)
328
329 url = broadcast_url(gobj, quality)
330
331
332
333
334 resp, _ = httplib2.Http().request(url, 'HEAD')
335 if resp['status'] != '200':
336 _eprint('BAD URL (http status %s) for game %s: %s'
337 % (resp['status'], _nice_game(gobj), url))
338 _eprint('FAILED to download game %s' % _nice_game(gobj))
339 return
340
341 cmd = ['ffmpeg',
342 '-timeout', '60',
343 '-i', url]
344 if dry_run:
345 cmd += ['-t', '30']
346 cmd += ['-absf', 'aac_adtstoasc',
347 '-acodec', 'copy',
348 '-vcodec', 'copy',
349 fp,
350 ]
351
352 _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
353 if not _run_command(cmd):
354 _eprint('FAILED to download game %s' % _nice_game(gobj))
355 else:
356 _eprint('DONE with game %s' % _nice_game(gobj))
357
358
360 """
361 Starts an rtmpdump process to download the full coach footage of the
362 given game. Currently, the only quality available is 1600.
363
364 The footage will be saved to the following path::
365
366 footage_dir/{eid}-{gamekey}.mp4
367
368 If footage is already at that path, then a LookupError is raised.
369
370 A full game's worth of footage at a quality of 1600 is about 1GB.
371 """
372 fp = _full_path(footage_dir, gobj)
373 if os.access(fp, os.R_OK):
374 raise LookupError('Footage path "%s" already exists.' % fp)
375
376 server, app, path = coach_url(gobj)
377
378 cmd = ['rtmpdump',
379 '--rtmp', server,
380 '--app', app,
381 '--playpath', path,
382 '--timeout', '60',
383 ]
384 if dry_run:
385 cmd += ['--stop', '30']
386 cmd += ['-o', fp]
387
388 _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
389 status = _run_command(cmd)
390 if status is None:
391 _eprint('DONE (incomplete) with game %s' % _nice_game(gobj))
392 elif not status:
393 _eprint('FAILED to download game %s' % _nice_game(gobj))
394 else:
395 _eprint('DONE with game %s' % _nice_game(gobj))
396
397
399 try:
400 p = subprocess.Popen(cmd,
401 stdout=subprocess.PIPE,
402 stderr=subprocess.STDOUT)
403 output = p.communicate()[0].strip()
404
405 if p.returncode > 0:
406 err = subprocess.CalledProcessError(p.returncode, cmd)
407 err.output = output
408 raise err
409 except subprocess.CalledProcessError, e:
410
411 if e.returncode == 2 and cmd[0] == 'rtmpdump':
412 return None
413 indent = lambda s: '\n'.join(map(lambda l: ' %s' % l, s.split('\n')))
414 _eprint("Could not run '%s' (exit code %d):\n%s"
415 % (' '.join(cmd), e.returncode, indent(e.output)))
416 return False
417 except OSError, e:
418 _eprint("Could not run '%s' (errno: %d): %s"
419 % (' '.join(cmd), e.errno, e.strerror))
420 return False
421 return True
422
423
def plays(gobj, coach=True):
    """
    Returns an ordered dictionary of all plays for a particular game
    with timings for the coach footage. If coach is False, then the
    timings will be for the broadcast footage.

    The game must be a nflgame.game.Game object.

    If there is a problem retrieving the data, None is returned.

    Parsed plays are kept in an in-memory cache (one cache per footage
    type) which is consulted only once the game is over.

    If the game is over, then the XML data is saved to disk.
    """
    if coach:
        cache = __coach_cache
    else:
        cache = __broadcast_cache

    if gobj.game_over() and gobj.eid in cache:
        return cache[gobj.eid]

    rawxml = _get_xml_data(gobj.eid, gobj.gamekey)
    ps = _xml_plays(rawxml, coach)
    if ps is None:
        return None
    if len(ps) == 0:
        _eprint('Could not find timing nodes in XML data, '
                'which provide the start time of each play.')
        return None

    # Store into the cache that matches the requested footage type.
    # Writing to the broadcast cache unconditionally would leave the coach
    # cache permanently empty and hand coach timings to broadcast callers.
    cache[gobj.eid] = ps

    # Save the XML data to disk for future use once the game is over.
    fp = _xmlf % (gobj.eid, gobj.gamekey)
    if gobj.game_over() and not os.access(fp, os.R_OK):
        try:
            out = gzip.open(fp, 'w+')
            try:
                out.write(rawxml)
            finally:
                # Close explicitly so the gzip stream is flushed and the
                # file handle is released.
                out.close()
        except IOError:
            _eprint('Could not cache XML data. Please make '
                    '"%s" writable.' % path.dirname(fp))
    return ps
463
464
def play(gobj, playid, coach=True):
    """
    Returns a Play object given a game and a play id with timings for
    the coach footage. If coach is False, then the timings will be for
    the broadcast footage.

    The game must be a nflgame.game.Game object.

    If a play with the given id does not exist, or if the play-by-play
    meta data for the game could not be retrieved, None is returned.
    """
    # Forward coach so that broadcast timings can actually be requested.
    ps = plays(gobj, coach)
    if ps is None:
        return None
    return ps.get(playid, None)
476
477
478 -class Play (object):
479 """
480 Represents a single play with meta data that ties it to game footage.
481 The footage_start corresponds to the 'ArchiveTCIN' or 'CATIN', which
482 is when the play starts. Since there is no record of when a play
483 stops, the end is computed by using the start time of the next play.
484 If it's the last play recorded, then the end time is None.
485
486 The play id is the foreign key that maps to play data stored in nflgame.
487 """
488 - def __init__(self, start, end, playid):
489 self.start, self.end, self.playid = start, end, playid
490
492 """Returns a string play id padded with zeroes."""
493 return '%04d' % int(self.playid)
494
496 return '(%s, %s, %s)' % (self.playid, self.start, self.end)
497
498
500 """
501 Represents a footage time point, in the format HH:MM:SS:MMM where
502 MMM can be either 2 or 3 digits.
503 """
504 - def __init__(self, point=None, seconds=None):
505 """
506 Construct a PlayTime object given a point in time in the format
507 HH:MM:SS:MMM where MMM can be either 2 or 3 digits.
508
509 Alternatively, seconds can be provided (which may be a float).
510 """
511 if seconds is not None:
512 milli = int(1000 * (seconds - math.floor(seconds)))
513
514 seconds = int(math.floor(seconds))
515 hh = seconds / 3600
516
517 seconds -= hh * 3600
518 mm = seconds / 60
519
520 seconds -= mm * 60
521 ss = seconds
522
523 self.hh, self.mm, self.ss, self.milli = hh, mm, ss, milli
524 self.__point = '%02d:%02d:%02d:%03d' % (hh, mm, ss, milli)
525 return
526
527 self.__point = point
528 self.__coach = False
529
530 try:
531 parts = self.__point.split(':')
532 if len(parts[3]) == 3:
533 self.__coach = True
534 parts = map(int, parts)
535 except ValueError:
536 assert False, 'Bad play time format: %s' % self.__point
537
538 if len(parts) != 4:
539 assert False, 'Expected 4 parts but got %d in: %s' \
540 % (len(parts), self.__point)
541
542 self.hh, self.mm, self.ss, self.milli = parts
543
544
545
546
547 if not self.__coach:
548 self.milli *= 10
549
555
557 """
558 Returns this time point rounded to the nearest second.
559 """
560 secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
561 if self.milli >= 50:
562 secs += 1
563 return secs
564
566 """
567 Returns this time point as fractional seconds based on milliseconds.
568 """
569 secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
570 secs = (1000 * secs) + self.milli
571 return float(secs) / 1000.0
572
575
577 """
578 Returns the difference rounded to nearest second between
579 two time points. The 'other' time point must take place before the
580 current time point.
581 """
582 assert other <= self, '%s is not <= than %s' % (other, self)
583 return int(round(self.fractional() - other.fractional()))
584
587
588
590 """
591 Parses the XML raw data given into an ordered dictionary of Play
592 objects corresponding to coach play timings. If coach is set to
593 False, then play timings for the broadcast are retrieved.
594
595 The dictionary is keyed by play id.
596 """
597 if data is None:
598 return None
599
600
601
602 rows = []
603 for row in bs4.BeautifulSoup(data).find_all('row'):
604 playid = row.find('id')
605 if not playid:
606 playid = row.get('playid', None)
607 if not playid:
608 continue
609 playid = playid.strip()
610 else:
611 playid = playid.get_text().strip()
612
613 if coach:
614 start = row.find('catin')
615 else:
616 start = row.find('archivetcin')
617 if not start:
618 continue
619 start = PlayTime(start.get_text().strip())
620
621
622 if len(rows) > 0 and start < rows[-1][1]:
623 continue
624 rows.append((playid, start, row))
625
626
627
628
629 def ignore(row):
630 if 'playdescription' in row.attrs:
631 if row['playdescription'].lower().startswith('timeout'):
632 return True
633 if row['playdescription'].lower().startswith('two-minute'):
634 return True
635
636
637 if 'preplaybyplay' in row.attrs:
638 if row['preplaybyplay'].lower().startswith('timeout'):
639 return True
640 return False
641
642 d = OrderedDict()
643 for i, (playid, start, row) in enumerate(rows):
644 if ignore(row):
645 continue
646 end = None
647 if i < len(rows) - 1:
648 end = rows[i+1][1]
649 d[playid] = Play(start, end, playid)
650 return d
651
652
654 """
655 Returns the XML play data corresponding to the game given. A game must
656 be specified in one of two ways: by providing the eid and gamekey or
657 by providing the file path to a gzipped XML file.
658
659 If the XML data is already on disk, it is read, decompressed and returned.
660
661 Otherwise, the XML data is downloaded from the NFL web site. If the data
662 doesn't exist yet or there was an error, _get_xml_data returns None.
663 """
664 assert (eid is not None and gamekey is not None) or fpath is not None
665
666 if fpath is not None:
667 return gzip.open(fpath).read()
668
669 fpath = _xmlf % (eid, gamekey)
670 if os.access(fpath, os.R_OK):
671 return gzip.open(fpath).read()
672 try:
673 year = int(eid[0:4])
674 month = int(eid[4:6])
675 if month <= 3:
676 year -= 1
677 u = _xml_base_url % (year, gamekey)
678 return urllib2.urlopen(u, timeout=10).read()
679 except urllib2.HTTPError, e:
680 _eprint(e)
681 except socket.timeout, e:
682 _eprint(e)
683 return None
684