"""Import Tribune-media XML format -- i.e. the On TV format

On_Product_Overview_062910.pdf describes the overall feed format.

We wind up with multiple files looking like this:

    on_*_lineups_*_yyyymmdd.xml
    on_*_tv_programs_yyyymmdd.xml
    on_*_tv_schedules_yyyymmdd.xml
    on_*_tv_sources_yyyymmdd.xml

where each file name includes a region/provider ID:

    * usa
    * can 
    * mx
    * <clientid>

where the lineup names include:

    * cable
    * digital
    * ota
    * satellite

They are generally delivered over FTP.
"""
import logging, os, argparse, json, glob, time
from atxstyle import utctime, standardlog
import gzip
from lxml import etree as ET
from fussy import twrite
import datetime
import pytz
# Module-level logger, named after this module
log = logging.getLogger(__name__)
# Directory portion of this source file's path
HERE = os.path.dirname(__file__)

def parse_gzipped_xml( filename ):
    """Parse an (optionally gzip-compressed) XML file into an ETree structure

    filename -- path of the file to parse; names ending in '.gz' are
        decompressed on the fly, anything else is read as-is

    The file is always read in binary mode so that the XML parser decodes
    the document itself (honouring the document's own encoding declaration)
    rather than relying on the platform's default text encoding, and the
    handle is closed as soon as the parse completes.

    returns the parsed lxml ElementTree
    """
    parser = ET.XMLParser(ns_clean=True)
    opener = gzip.open if filename.endswith( '.gz' ) else open
    # ET.parse reads the whole document before returning, so the handle
    # can safely be closed when we leave the with-block
    with opener( filename, 'rb' ) as handle:
        return ET.parse( handle, parser )
    

def load_programs( program_file ):
    """Load the program records from a programs XML file

    program_file -- path to the (optionally gzip-compressed) programs file

    Every ``program`` element in the document produces one dictionary:

        tmsid -- the element's TMSId attribute (None if absent)
        title -- text of the first ./titles/title element
        show_title -- text of the last ./episodeInfo/title element or,
            when that is missing/empty, of the last ./descriptions/desc
        genre -- '|'-joined text of the ./genres/genre elements
        language -- text of the first ./origAudioLang element

    Text fields are always strings: an absent or empty element yields u''
    instead of None, and empty genre elements are skipped so that the join
    cannot fail on a None value.

    returns list of program dictionaries in document order
    """
    tree = parse_gzipped_xml( program_file )
    programs = []
    for incoming in tree.xpath('//program' ):
        program = {
            'tmsid': incoming.get('TMSId'),
            'title': u'',
            'show_title': u'',
            'language': u'',
        }
        # only the first title is used
        for title in incoming.xpath('./titles/title'):
            program['title'] = title.text or u''
            break
        # no break here: when several episode titles exist the last one wins
        for show_title in incoming.xpath('./episodeInfo/title'):
            program['show_title'] = show_title.text or u''
        program['genre'] = '|'.join([
            genre.text
            for genre in incoming.xpath( './genres/genre' )
            if genre.text
        ])
        for language in incoming.xpath('./origAudioLang' ):
            program['language'] = language.text or u''
            break
        if not program['show_title']:
            # no episode title, fall back to the (last) description
            for description in incoming.xpath('./descriptions/desc' ):
                program['show_title'] = description.text or u''
        programs.append( program )
    return programs

def load_stations( channel_file ):
    """Load the station (program service) records from a sources XML file

    channel_file -- path to the (optionally gzip-compressed) sources file

    Every ``prgSvc`` element in the document produces one dictionary:

        tmsid -- the element's prgSvcId attribute (None if absent)
        name -- text of ./name
        short_name -- text of ./callSign
        language -- ', '-joined text of the ./edLangs/edLang elements
        timezone -- pytz timezone looked up as 'US/<description>', or u''
            when there is no usable ./timeZone element
        location -- 'city, state' built from ./address (either part may
            be missing)

    A time-zone description that pytz does not know as a US/* zone is
    logged as a warning and leaves the timezone empty, rather than
    aborting the whole import.

    returns list of station dictionaries in document order
    """
    tree = parse_gzipped_xml( channel_file )
    stations = []
    for service in tree.xpath('//prgSvc' ):
        station = {
            'tmsid': service.get('prgSvcId'),
            'name': u'',
            'short_name': u'',
            'language': u'',
            'timezone': u'',
            'location': u'',
        }
        for name in service.xpath('./name'):
            station['name'] = name.text or u''
        for callsign in service.xpath('./callSign' ):
            station['short_name'] = callsign.text or u''
        station['language'] = u", ".join([
            language.text
            for language in service.xpath('./edLangs/edLang')
            if language.text
        ])

        for timezone in service.xpath('./timeZone' ):
            base = timezone.text or u''
            if base.endswith( ' Observing' ):
                base = base[:-len(' Observing')]
            if not base:
                continue
            # try to translate to a standard timezone...
            # They are just using human-readable descriptions, not a standard AFAICS
            try:
                station['timezone'] = pytz.timezone( u'US/%s'%base )
            except pytz.UnknownTimeZoneError:
                log.warning(
                    "Unrecognised time zone %r for station %s, leaving it unset",
                    timezone.text, station['tmsid'],
                )

        for address in service.xpath('./address' ):
            for city in address.xpath('./city' ):
                station['location'] = city.text or u''
            for state in address.xpath('./state' ):
                if not state.text:
                    # an empty state adds nothing to the location
                    continue
                if station['location']:
                    station['location'] = u'%s, %s'%(station['location'],state.text)
                else:
                    station['location'] = state.text
        stations.append( station )
    return stations

def _duration_seconds( dur ):
    """Convert a 'PT<n>H<n>M<n>S' duration string into a number of seconds

    Each of the hour, minute and second components is optional, so
    'PT01H30M', 'PT30M' and 'PT2H' are all accepted.

    raises ValueError for anything that does not match that form
    """
    if not dur or not dur.startswith( 'PT' ):
        raise ValueError( 'Unrecognised duration: %r'%( dur, ))
    unit_seconds = {'H': 3600, 'M': 60, 'S': 1}
    total = 0
    digits = ''
    for char in dur[2:]:
        if char in '0123456789':
            digits += char
        elif char in unit_seconds and digits:
            total += int( digits, 10 ) * unit_seconds[char]
            digits = ''
        else:
            raise ValueError( 'Unrecognised duration: %r'%( dur, ))
    if digits:
        # trailing number with no unit designator
        raise ValueError( 'Unrecognised duration: %r'%( dur, ))
    return total

def load_schedules( schedule_file ):
    """Load the schedule events from a schedules XML file

    schedule_file -- path to the (optionally gzip-compressed) schedules file

    Every ./event below every ``schedule`` element produces one list:

        [ prgSvcId of the schedule, TMSId of the event,
          (year, month, day, hour, minute), duration in seconds ]

    The date comes from the event's 'date' attribute (yyyy-mm-dd), the
    time from ./times/time (hh:mm, defaulting to 00:00) and the duration
    from the 'dur' attribute of ./tv.  An event without a ./tv element
    gets a duration of 0 instead of inheriting the previous event's value.

    raises ValueError if a duration, date or time cannot be parsed
    returns list of schedule lists in document order
    """
    tree = parse_gzipped_xml( schedule_file )
    schedules = []
    for sched in tree.xpath( '//schedule' ):
        for event in sched.xpath('./event' ):
            year,month,day = [int(x,10) for x in event.get('date').split('-')]
            hour,minute = 0,0
            for time_rec in event.xpath('./times/time'):
                hour,minute = [int(x,10) for x in time_rec.text.split(':')]
            # reset per event so a missing <tv> can never reuse stale values
            duration = 0
            for tv in event.xpath( './tv' ):
                duration = _duration_seconds( tv.get('dur') )
            schedule = [
                sched.get('prgSvcId'),
                event.get('TMSId'),
                (year, month, day, hour, minute),
                duration,
            ]
            schedules.append( schedule )
    return schedules

def convert( source_directory ):
    """Convert the latest files in source directory into native format

    source_directory -- directory holding the unpacked programs, sources
        and schedules files; for each kind the file whose name sorts last
        is used

    Loads the three files, verifies that every schedule entry refers to a
    known program and a known station, then rewrites each schedule entry
    in place so that it holds [station id, program id, start, stop] where
    start is the utctime timestamp of the event and stop is start plus
    the duration in seconds.

    raises SystemExit(2) (after logging) when any of the three files is missing
    raises ValueError when a schedule refers to an unknown program or station

    returns a json-compatible dictionary with keys 'success', 'ts',
        'schedules', 'stations' and 'programs'; stations and programs are
        flattened into lists of field values
    """
    try:
        program_file = sorted(
            glob.glob( os.path.join( source_directory, 'on_*_*_programs_*.xml*' ))
        )[-1]
        station_file = sorted(
            glob.glob( os.path.join( source_directory, 'on_*_*_sources_*.xml*' ))
        )[-1]
        schedule_file = sorted(
            glob.glob( os.path.join( source_directory, 'on_*_*_schedules_*.xml*' ))
        )[-1]
    except IndexError:
        log.error(
            "Missing expected files, expected one each of on_*_*_<programs, sources, schedules>_*.xml* files, got: %s",
            ", ".join( glob.glob( os.path.join( source_directory, '*' ))),
        )
        raise SystemExit( 2 )

    programs = load_programs( program_file )
    stations = load_stations( station_file )
    schedules = load_schedules( schedule_file )

    # now check cross-references and do cross-dependent resolution
    program_ids = set( program['tmsid'] for program in programs )
    station_ids = set( station['tmsid'] for station in stations )
    # NOTE(review): start times are localized with utctime.UTC, not with the
    # station's own timezone -- confirm that the feed delivers UTC times
    timezone = utctime.UTC
    for schedule in schedules:
        station_id, program_id = schedule[0], schedule[1]
        # explicit checks (not assert) so they still run under python -O
        if program_id not in program_ids:
            raise ValueError(
                'Schedule for station %r references unknown program %r'%(
                    station_id, program_id,
                )
            )
        if station_id not in station_ids:
            raise ValueError(
                'Schedule for program %r references unknown station %r'%(
                    program_id, station_id,
                )
            )
        # convert the schedule's start-time to a real time coordinate...
        naive = datetime.datetime(*schedule[2])
        schedule[2] = utctime.as_timestamp(timezone.localize(naive))
        # duration becomes the stop time
        schedule[3] = schedule[2] + schedule[3]
    # we don't want to linearize the timezones as objects...
    for station in stations:
        station['timezone'] = str(station['timezone'])
    return {
        'success': True,
        'ts': time.time(),
        'schedules': schedules,
        'stations': [
            [r[key] for key in [
                'tmsid', 'name', 'short_name', 'language', 'timezone', 'location'
            ]]
            for r in stations
        ],
        'programs': [
            [r[key] for key in [
                'tmsid','title','show_title','genre','language',
            ]]
            for r in programs
        ],
    }

def get_options():
    """Build the command-line parser for the converter

    The parser takes one positional DIRECTORY argument (stored as
    ``source``) and an optional -o/--output PATH (stored as ``output``,
    None when not given).

    returns the configured argparse.ArgumentParser (not the parsed options)
    """
    cli = argparse.ArgumentParser(
        description='Convert Tribune XML files into internal data-server formats',
    )
    source_settings = dict(
        metavar='DIRECTORY',
        help="Directory from which to import (unpacked with the 3 files present)",
    )
    output_settings = dict(
        metavar="PATH",
        default=None,
        help='Output file to write (default stdout)',
    )
    cli.add_argument( 'source', **source_settings )
    cli.add_argument( '-o', '--output', **output_settings )
    return cli


@standardlog.with_debug( 'epgfetch-convert-tribune' )
def main():
    """Command-line entry point: convert a directory and emit the result as JSON

    The JSON document is written to the -o/--output path via twrite when
    one is given, otherwise it is printed on stdout.
    """
    options = get_options().parse_args()
    converted = convert( options.source )
    linear = json.dumps( converted )
    if not options.output:
        # no output path requested, dump to stdout
        print( linear )
        return
    twrite.twrite( options.output, linear )
#    if options.stations:
#        stations = {
#            'success':True, 
#            'stations': converted['stations'], 
#        }
#        linear = json.dumps(stations)
#        twrite.twrite(options.stations, linear )
#    

# Run the converter only when executed as a script, not when imported
if __name__ == "__main__":
    main()
