#!/usr/bin/python
__requires__ = 'TurboGears[future]'
import pkg_resources
pkg_resources.require("TurboGears")

import glob
import logging
import re
import optparse
import os
import stat
import string
import sys
import yum.repoMDObject
import datetime
import time
import hashlib
import warnings

from sqlobject import *
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning) 
    import turbogears
from turbogears import config

sys.path.append('/usr/share/mirrormanager/server')
from mirrormanager.model import *
from mirrormanager.repomap import *
from mirrormanager.lib import manage_pidfile, remove_pidfile, run_rsync, parent_dir


# Lock file handed to manage_pidfile()/remove_pidfile() in main(); main()
# refuses to start when manage_pidfile() reports another instance.
pidfile='/var/run/mirrormanager/umdl.pid'
# Default value of the --delete-directories command line option.
delete_directories=False
# Module-wide logger; stays None until main() configures it.
logger = None
# Regular expressions for paths that are skipped in every category; the
# scanners combine them with the per-category excludes.
stdexcludes=['.*\.snapshot', '.*/\.~tmp~']

# Database hub for the "mirrormanager" package, exposed as __connection__
# (presumably picked up by SQLObject as the default connection -- confirm).
from turbogears.database import PackageHub
hub = PackageHub("mirrormanager")
__connection__ = hub

def remove_category_topdir(topdirName, path):
    """Return path with the leading topdirName plus one separator cut off.

    No check is made that path really starts with topdirName: the first
    len(topdirName) + 1 characters are simply dropped.
    """
    prefix_length = len(topdirName) + 1
    return path[prefix_length:]

def _get_version_from_path(path):
    # Debian/Ubuntu versioning
    s = r'dists/(\w+)/' # this ignores 10.10 and maverick-{anything}, but picks up 'maverick'
    m = re.search(re.compile(s), path)
    if m is not None:
        return m.group(1)
    # Fedora versioning
    s = r'/?(([\.\d]+)(\-\w+)?)/'
    m = re.search(re.compile(s), path)
    if m is not None:
        return m.group(1)
    return None

def create_version_from_path(category, path):
    """Try to create a Version row for the version name found in path.

    Returns the new Version, or None when no version name can be extracted
    from the path or when creating the row fails for any reason.
    """
    vname = _get_version_from_path(path)
    if vname is None or vname == '':
        return None

    # anything below a /test/ directory is flagged as a test release
    isTest = u'/test/' in path
    try:
        return Version(product=category.product, name=vname, isTest=isTest)
    except:
        # best effort: a failed creation just means "no version"
        return None

# Lists of Arch / Version rows converted to plain dicts; both stay None
# until setup_arch_version_cache() fills them.
arch_cache = None
version_cache = None

def setup_arch_version_cache():
    """Fill the module-level arch_cache and version_cache once.

    Each cache holds the sqlmeta.asDict() form of every row; versions are
    selected ordered by descending id.  A cache that is already populated
    is left alone.
    """
    global arch_cache, version_cache

    if arch_cache is None:
        arch_cache = [row.sqlmeta.asDict() for row in list(Arch.select())]

    if version_cache is None:
        version_cache = [row.sqlmeta.asDict()
                         for row in list(Version.select(orderBy='-id'))]

def guess_ver_arch_from_path(category, path):
    """Guess the (Version, Arch) a directory path belongs to.

    The architecture is 'source' for any path containing 'SRPMS'; otherwise
    it is the first cached architecture whose name occurs as a complete path
    component.  The version is the first cached version of the category's
    product whose name occurs as a complete path component; when none
    matches, create_version_from_path() is asked to create one and the new
    row is added to version_cache.

    Names are matched literally (re.escape), so a '.' in a name such as
    '9.newkey' only matches a real dot instead of any character.

    Expects arch_cache and version_cache to have been filled by
    setup_arch_version_cache().  Returns a (ver, arch) tuple in which either
    element may be None.
    """
    def _has_component(name):
        # name must be delimited by '/' or by the start/end of the path
        pattern = '.*(^|/)%s(/|$).*' % re.escape(name)
        return re.compile(pattern).match(path) is not None

    arch = None
    if 'SRPMS' in path:
        arch = Arch.byName('source')
    else:
        for a in arch_cache:
            if _has_component(a['name']):
                arch = Arch.get(a['id'])
                break

    ver = None
    # newest versions/IDs first, also handles stupid Fedora 9.newkey hack.
    for v in version_cache:
        if v['productID'] != category.product.id:
            continue
        if _has_component(v['name']):
            ver = Version.get(v['id'])
            break

    # create Versions if we can figure it out...
    if ver is None:
        ver = create_version_from_path(category, path)
        if ver:
            version_cache.append(ver.sqlmeta.asDict())
    return (ver, arch)


# Something like this is committed to yum upstream, but may not be in the copy we are using.
def set_repomd_timestamp(yumrepo):
    """Store the newest per-filetype timestamp on yumrepo and return it.

    Walks yumrepo.fileTypes(), converts each repoData entry's timestamp to
    an int and keeps the largest one (0 when there are no entries).
    """
    newest = 0
    for filetype in yumrepo.fileTypes():
        candidate = int(yumrepo.repoData[filetype].timestamp)
        if candidate > newest:
            newest = candidate
    yumrepo.timestamp = newest
    return newest

def make_file_details_from_checksums(diskpath, relativeDName, D):
    """Record FileDetail rows for files named in a directory's checksum files.

    Looks in diskpath/relativeDName for well-known md5/sha1/sha256/sha512
    checksum files, collects the per-file checksums they list and, for each
    listed file that exists on disk, makes sure a FileDetail row with those
    checksums, the file size and its ctime exists for Directory D.

    Does nothing when diskpath is None.  Unreadable checksum files and
    listed files that are missing on disk are skipped.
    """
    def _parse_checksum_file(checksum_path, checksumlen):
        """Return {filename: checksum} for lines whose checksum has checksumlen chars."""
        r = {}
        try:
            # checksum_path comes straight from glob and already includes
            # diskpath, so it must not be joined onto diskpath again.
            with open(checksum_path, 'r') as f:
                for line in f:
                    s = line.split()
                    if len(s) < 2:
                        continue
                    if len(s[0]) != checksumlen:
                        continue
                    # strip off extraneous starting '*' char from name
                    r[s[1].lstrip('*')] = s[0]
        except (IOError, OSError, UnicodeError):
            # best effort: keep whatever was parsed before the failure
            pass
        return r

    def _checksums_from_globs(globs, checksumlen):
        """Merge the checksums of every file matching one of globs."""
        d = {}
        for g in globs:
            for checksum_path in glob.glob(os.path.join(diskpath, relativeDName, g)):
                d.update(_parse_checksum_file(checksum_path, checksumlen))
        return d

    if diskpath is None:
        return

    sha1_globs = ['*.sha1sum', 'SHA1SUM', 'sha1sum.txt']
    md5_globs = ['*.md5sum', 'MD5SUM', 'md5sum.txt']
    sha256_globs = ['*-CHECKSUM', 'sha256sum.txt']
    sha512_globs = ['*.sha512sum', 'SHA512SUM', 'sha512sum.txt']
    # the expected hex digest length tells the checksum types apart
    md5dict = _checksums_from_globs(md5_globs, 32)
    sha1dict = _checksums_from_globs(sha1_globs, 40)
    sha256dict = _checksums_from_globs(sha256_globs, 64)
    sha512dict = _checksums_from_globs(sha512_globs, 128)

    files = set(md5dict) | set(sha1dict) | set(sha256dict) | set(sha512dict)

    for f in sorted(files):
        try:
            s = os.stat(os.path.join(diskpath, relativeDName, f))
        except OSError:
            # bail if the file doesn't actually exist
            continue
        details = dict(directory=D, filename=f,
                       sha1=sha1dict.get(f), md5=md5dict.get(f),
                       sha256=sha256dict.get(f), sha512=sha512dict.get(f),
                       size=s.st_size, timestamp=s[stat.ST_CTIME])
        try:
            # an identical row already exists: nothing to do
            FileDetail.selectBy(**details)[0]
        except IndexError:
            FileDetail(**details)
    

def make_repomd_file_details(diskpath, relativeDName, D, category):
    """Record a FileDetail row for the repomd.xml of a repodata directory.

    Only acts when diskpath is given, relativeDName ends in '/repodata' and
    diskpath/relativeDName/repomd.xml exists and can be read.  The file's
    size and md5/sha1/sha256/sha512 digests are computed from its raw bytes,
    and the timestamp is taken from the parsed repo metadata.  A FileDetail
    for Directory D is created unless an identical one already exists.

    category is accepted for interface compatibility and is not used.
    """
    if diskpath is None:
        return
    if not relativeDName.endswith(u'/repodata'):
        return
    absolutepath = os.path.join(diskpath, relativeDName, 'repomd.xml')
    if not os.path.exists(absolutepath):
        return
    try:
        # binary mode: digests and size must describe the bytes on disk
        with open(absolutepath, 'rb') as f:
            repomd = f.read()
    except (IOError, OSError):
        return
    size = len(repomd)
    md5 = hashlib.md5(repomd).hexdigest()
    sha1 = hashlib.sha1(repomd).hexdigest()
    sha256 = hashlib.sha256(repomd).hexdigest()
    sha512 = hashlib.sha512(repomd).hexdigest()

    yumrepo = yum.repoMDObject.RepoMD('repoid', absolutepath)
    # fall back to our own computation when the yum object has no timestamp
    if 'timestamp' not in yumrepo.__dict__:
        set_repomd_timestamp(yumrepo)
    timestamp = yumrepo.timestamp

    details = dict(directory=D, filename='repomd.xml',
                   sha1=sha1, md5=md5, sha256=sha256, sha512=sha512,
                   timestamp=timestamp, size=size)
    try:
        # an identical row already exists: nothing to do
        FileDetail.selectBy(**details)[0]
    except IndexError:
        FileDetail(**details)


def move_repository_from_development(prefix, arch, newdir):
    """Re-point an existing Repository from a development to a releases dir.

    Looks up the first Repository with the given prefix and arch and returns
    None when that lookup fails.  If the repository's current directory name
    contains '/development' and newdir's name contains '/releases', the
    repository is moved to newdir.  Returns the Repository.
    """
    try:
        matches = Repository.selectBy(prefix=prefix, arch=arch)
        repo = matches[0]
    except:
        return None

    if repo.directory is None or newdir.name is None:
        return repo

    leaving_development = '/development' in repo.directory.name
    if leaving_development and '/releases' in newdir.name:
        repo.directory = newdir
    return repo

def make_repository(directory, relativeDName, category):
    """Create a Repository row for directory when version and arch are known.

    Returns None when the version or architecture cannot be guessed from
    relativeDName, when the directory already has a repository, or when
    creating the row fails; otherwise returns the new Repository.
    """
    (ver, arch) = guess_ver_arch_from_path(category, relativeDName)
    if ver is None or arch is None:
        return None

    # stop making duplicate Repository objects.
    if len(directory.repositories) > 0:
        return None

    prefix = repo_prefix(relativeDName, category, ver)
    repo = None
    try:
        # Historically, Repository.name was a longer string with product
        # and category delineations.  But we were getting unique constraint
        # conflicts once we started introducing repositories under
        # repositories.  And .name isn't used for anything meaningful.  So
        # simply have it match dir.name, which can't conflict.
        repo = Repository(name=directory.name, category=category, version=ver,
                          arch=arch, directory=directory, prefix=prefix)
        created = (prefix, ver.name, arch.name, category.name, directory.name)
        logger.info('Created Repository(prefix=%s, version=%s, arch=%s, category=%s) -> Directory %s' % created)
    except:
        # best effort: failures here are deliberately ignored
        pass

    return repo

def is_excluded(path, excludes):
    """Return True when any regular expression in excludes matches path.

    Patterns are applied with match(), i.e. anchored at the start of path.
    """
    return any(re.compile(pattern).match(path) for pattern in excludes)


def nuke_gone_directories(diskpath, category):
    """Delete the category's Directory rows that no longer exist on disk.

    Deleting a Directory has a ripple effect through the whole database.
    Be really sure you're ready to do this.  It comes in handy when, say,
    a Test release is dropped.

    Directories matching the category's excludes are never touched.
    """
    topdirName = category.topdir.name

    directories = category.directories  # in ascending name order
    directories.reverse()  # now in descending name order, bottoms up
    for d in directories:
        if is_excluded(d.name, category.excludes):
            continue
        relativeDName = remove_category_topdir(topdirName, d.name)
        if os.path.isdir(os.path.join(diskpath, relativeDName)):
            # still on disk, keep it
            continue
        if len(d.categories) != 1:
            # safety: only delete directories owned by exactly one category
            continue
        logger.info("Deleting gone directory %s" % (d.name))
        d.destroySelf()

def ctime_from_rsync(date, hms):
    """Convert rsync's 'YYYY/MM/DD' and 'HH:MM:SS' strings to an epoch int.

    The fields are interpreted as local time (time.mktime).  Raises
    ValueError when either string does not have exactly three numeric fields.
    """
    year, month, day = [int(part) for part in date.split('/')]
    hour, minute, second = [int(part) for part in hms.split(':')]
    moment = datetime.datetime(year, month, day, hour, minute, second)
    return int(time.mktime(moment.timetuple()))


def fill_category_directories_from_rsync(line, category, topdirName, category_directories):
    """Record one directory line of an rsync listing in category_directories.

    line must have the five whitespace separated fields
    'perms size date time path'.  The path is relative to the category top
    directory; rsync's '.' entry (the top directory itself) is stored under
    the key '' because that is how the rest of this module keys the top
    directory.

    Directories matching stdexcludes or the category's excludes are not
    recorded.  A directory is flagged unreadable (and remembered in
    category.unreadable_dirs) when its "other" permissions lack r and x, or
    when its parent was already unreadable.  A directory named '...repodata'
    also marks its parent as a repository.

    category_directories is updated in place and is always returned, so
    callers that rebind their dictionary to the return value keep working
    even for excluded directories.

    Raises ValueError when the line does not have five fields or its
    date/time cannot be parsed.
    """
    perms, _size, date, hms, relativeDName = line.split()
    if relativeDName == '.':
        relativeDName = ''
        directoryname = topdirName
    else:
        directoryname = os.path.join(topdirName, relativeDName)

    if is_excluded(directoryname, stdexcludes + category.excludes):
        return category_directories

    readable = True
    if not re.compile('^d......r.x').match(perms) or parent_dir(relativeDName) in category.unreadable_dirs:
        readable = False
        category.unreadable_dirs.add(relativeDName)

    ctime = ctime_from_rsync(date, hms)

    category_directories[relativeDName] = {'files':{}, 'isRepository':False, 'readable':readable, 'ctime':ctime, 'changed':True}
    if relativeDName.endswith('repodata'):
        parent_relativeDName = parent_dir(relativeDName)
        try:
            category_directories[parent_relativeDName]['isRepository'] = True
        except KeyError:
            # parent not listed (yet): create its entry from this line's data
            category_directories[parent_relativeDName] = {'files':{}, 'isRepository':True, 'readable':readable, 'ctime':ctime, 'changed':True}

    return category_directories

def add_file_to_directory(line, category_directories):
    """Attach one file line of an rsync listing to its directory entry.

    line is expected to have the five whitespace separated fields
    'perms size date time path'.  Lines with a different number of fields
    or an unparsable date/time are ignored.  Files whose directory has no
    entry in category_directories (for example because the directory was
    excluded) are ignored as well instead of raising KeyError.

    The file is stored as category_directories[dir]['files'][name] =
    {'size': <size string>, 'stat': <epoch int>}.
    """
    try:
        perm, size, date, hms, filepath = line.split()
        dt = ctime_from_rsync(date, hms)
    except ValueError:
        return

    l = filepath.split('/')
    filename = l[-1]
    subpath = l[:-1]
    if len(subpath) > 0:
        relativeDName = os.path.join(*subpath)
    else:
        # a file directly in the category top directory
        relativeDName = ''

    try:
        directory = category_directories[relativeDName]
    except KeyError:
        # the containing directory is not being tracked
        return
    directory['files'][filename] = {'size':size,
                                    'stat':dt}

def short_filelist(files):
    """Shrink the file dictionary of very repetitive directories.

    files maps a filename to a dict with 'stat' and 'size' entries.  When
    more than 10 names share one of the suffixes .html, .rpm, .hdr or
    .drpm, only the 3 entries that sort last by (stat, name, size) -- the
    most recent ones -- are returned in a new dict.  Otherwise the very
    same files object is returned.
    """
    for suffix in ('.html', '.rpm', '.hdr', '.drpm'):
        if sum(1 for name in files if name.endswith(suffix)) > 10:
            break
    else:
        # no suffix is over-represented
        return files

    def _age_key(name):
        return (files[name]['stat'], name, files[name]['size'])

    # keep the most recent 3
    newest = sorted(files, key=_age_key)[-3:]
    return dict((name, files[name]) for name in newest)




def sync_category_directories(diskpath, category, category_directories):
    """Bring the database in line with the scanned category_directories.

    category_directories maps a directory name relative to the category top
    directory to a dict with the keys 'files', 'isRepository', 'readable',
    'ctime' and 'changed'.

    Pass 1 (in sorted name order) creates missing Directory rows, updates
    readable/ctime/files where they differ and records FileDetails from
    checksum files.  Pass 2 creates Repository rows and repomd.xml
    FileDetails.  Finally Directory.ageFileDetails() is called.
    """
    for relativeDName in sorted(category_directories.keys()):
        value = category_directories[relativeDName]
        readable_differs = False
        ctime_differs = False
        try:
            cached = category.directory_cache[relativeDName]
            if cached['readable'] != value['readable']:
                readable_differs = True
            if cached['ctime'] != value['ctime']:
                ctime_differs = True
            D = Directory.get(cached['id'])
        except KeyError:
            # not cached yet: the top directory already exists, anything
            # else has to be created
            if relativeDName == u'':
                D = category.topdir
            else:
                dname = os.path.join(category.topdir.name, relativeDName)
                D = Directory(name=dname, readable=value['readable'], ctime=value['ctime'])
                logger.debug(u'Created Directory(%s, readable=%s, ctime=%s)' % (dname, value['readable'], value['ctime']))
                D.addCategory(category)
            category.directory_cache[relativeDName] = D.sqlmeta.asDict()

        files_changed = True if value['changed'] else False

        if readable_differs:
            D.readable = value['readable']
        if ctime_differs:
            D.ctime = value['ctime']
        if files_changed:
            shortened = short_filelist(value['files'])
            # only write when the stored list really differs
            if D.files != shortened:
                D.files = shortened
        make_file_details_from_checksums(diskpath, relativeDName, D)

    # this has to be a second pass to be sure the child repodata/ dir is created in the db first
    for relativeDName, value in category_directories.iteritems():
        cached = category.directory_cache[relativeDName]
        D = Directory.get(cached['id'])
        if value['isRepository']:
            make_repository(D, relativeDName, category)
        make_repomd_file_details(diskpath, relativeDName, D, category)
    Directory.ageFileDetails()


def parse_rsync_listing(category, f):
    """Read an rsync directory listing from f and sync it into the database.

    f is a file-like object with readline().  Every line that looks like a
    directory entry (starts with 'd', has five fields and a ten character
    permission field) is handed to fill_category_directories_from_rsync();
    every other line is handed to add_file_to_directory().  The collected
    directories are then passed to sync_category_directories() with no
    disk path.

    The listing's '.' entry (the category top directory) is treated like
    any other directory line: the previous special case compared its regex
    against the whole line and therefore never matched.
    """
    topdirName = category.topdir.name
    category_directories = {}
    category.unreadable_dirs = set()
    while True:
        line = f.readline()
        if not line:
            break
        line = line.strip()
        fields = line.split()
        if line.startswith('d') and len(fields) == 5 and len(fields[0]) == 10:
            # good guess it's a directory line.  The helper fills the dict
            # in place; its return value is not used so that an empty
            # return can never replace the dictionary.
            fill_category_directories_from_rsync(line, category, topdirName, category_directories)
        else:
            add_file_to_directory(line, category_directories)

    sync_category_directories(None, category, category_directories)

def sync_directories_using_rsync(rsyncpath, category):
    """Sync a category from the listing produced by running rsync.

    run_rsync() is called with the category's extra rsync options.  If it
    raises, a warning is logged and nothing else happens.  A positive exit
    code is logged, but the listing is still parsed.
    """
    try:
        exit_code, listing = run_rsync(rsyncpath, category.extra_rsync_options)
    except:
        logger.warning('Failed to run rsync.', exc_info = True)
        return

    if exit_code > 0:
        details = (exit_code, category.name, listing)
        logger.info("rsync returned exit code %d for Category %s: %s" % details)

    # still, try to use the output listing if we can
    parse_rsync_listing(category, listing)

def sync_directories_from_file(filename, category):
    """Sync a category from an rsync listing stored in the file filename.

    The file is closed even when parsing fails.  Errors from opening the
    file or from parse_rsync_listing() propagate to the caller.
    """
    with open(filename, 'r') as f:
        # parse_rsync_listing takes (category, file object)
        parse_rsync_listing(category, f)

def cache_directories(category):
    """Return {relative directory name: row dict} for the category.

    Each of the category's Directory rows is converted with
    sqlmeta.asDict() and keyed by its name with the category top directory
    and surrounding slashes removed.
    """
    topdirName = category.topdir.name
    cache = {}
    for row in list(category.directories):
        as_dict = row.sqlmeta.asDict()
        key = remove_category_topdir(topdirName, as_dict['name']).strip(u'/')
        cache[key] = as_dict
    return cache

def sync_directories_from_disk(diskpath, category, excludes=[]):
    """Walk diskpath and sync what is found into the category's directories.

    Directories matching stdexcludes or excludes are skipped together with
    everything below them.  For every other directory its ctime, whether
    "others" have read or execute permission, and whether it contains a
    'repodata' subdirectory are recorded; files are only stat()ed when the
    directory's ctime differs from the cached one.  The result is handed to
    sync_category_directories().
    """
    category.unreadable_dirs = set()
    # drop any trailing slashes from diskpath
    diskpath = diskpath.rstrip(u'/')
    category_directories = {}
    skip_patterns = stdexcludes + excludes

    for dirpath, dirnames, filenames in os.walk(diskpath, topdown=True):
        relativeDName = dirpath[len(diskpath) + 1:].strip(u'/')
        if is_excluded(relativeDName, skip_patterns):
            logger.info("excluding %s" % (relativeDName))
            # exclude all its subdirs too
            del dirnames[:]
            continue

        # avoid disappearing directories
        try:
            s = os.stat(os.path.join(diskpath, relativeDName))
            ctime = s[stat.ST_CTIME]
        except OSError:
            continue

        try:
            d_ctime = category.directory_cache[relativeDName]['ctime']
        except KeyError:
            # we'll need to create it
            d_ctime = 0

        dirnames.sort()

        readable = bool(s.st_mode & stat.S_IRWXO & (stat.S_IROTH|stat.S_IXOTH))
        if not readable or parent_dir(relativeDName) in category.unreadable_dirs:
            category.unreadable_dirs.add(relativeDName)

        changed = (d_ctime != ctime)
        if changed:
            logger.info("%s has changed: %s != %s" % (relativeDName, d_ctime, ctime))

        entry = {'files':{}, 'isRepository':u'repodata' in dirnames, 'readable':readable, 'ctime':ctime, 'changed':changed}
        category_directories[relativeDName] = entry

        # skip per-file stat()s if the directory hasn't changed
        if not changed:
            continue
        for f in filenames:
            try:
                file_stat = os.stat(os.path.join(diskpath, relativeDName, f))
            except OSError:
                continue
            entry['files'][f] = {'size':str(file_stat.st_size),
                                 'stat':file_stat[stat.ST_CTIME]}

    sync_category_directories(diskpath, category, category_directories)

def main():
    """Command line entry point: update the master directory list.

    Parses the options, loads the TurboGears configuration, sets up the
    'umdl' logger and then processes every entry of the
    'umdl.master_directories' configuration value according to its 'type'
    ('rsync', 'file' or 'directory').  Exits with status 1 when the pidfile
    shows another instance; otherwise returns 0.
    """
    global delete_directories
    global logger
    # "import logging" alone does not load the handlers submodule that is
    # needed for WatchedFileHandler below.
    import logging.handlers

    parser = optparse.OptionParser(usage=sys.argv[0] + " [options]")
    parser.add_option("-c", "--config",
                      dest="config", default='dev.cfg',
                      help="TurboGears config file to use")
    parser.add_option("--logfile",
                      dest="logfile", default='/var/log/mirrormanager/umdl.log',
                      metavar="FILE", help="write logs to FILE")
    parser.add_option("--debug",
                      dest="debug", default=False, action="store_true",
                      help="enable debugging")

    parser.add_option("--delete-directories",
                      dest="delete_directories", default=delete_directories, action="store_true",
                      help="delete directories from the database that are no longer on disk")

    (options, args) = parser.parse_args()
    turbogears.update_config(options.config, modulename="mirrormanager.config")
    delete_directories = options.delete_directories

    fmt = '%(asctime)s %(message)s'
    datefmt = '%m/%d/%Y %I:%M:%S %p'
    formatter = logging.Formatter(fmt=fmt, datefmt=datefmt)
    logger = logging.getLogger('umdl')
    handler = logging.handlers.WatchedFileHandler(options.logfile, "a+b")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    if options.debug:
        logger.setLevel(logging.DEBUG)

    if manage_pidfile(pidfile):
        logger.warning("another instance is running, try again later.")
        sys.exit(1)

    try:
        logger.info("Starting umdl")
        setup_arch_version_cache()
        for i in config.get('umdl.master_directories'):
            cname = i['category']
            try:
                category = Category.byName(cname)
            except SQLObjectNotFound:
                logger.error('umdl.master_directories Category %s does not exist in the database, skipping' % (cname))
                continue

            if category.product is None:
                logger.error('umdl.master_directories Category %s has null Product, skipping' % (cname))
                continue

            category.excludes = i.get('excludes', [])
            category.extra_rsync_options = i.get('options')
            category.directory_cache = cache_directories(category)

            if i['type'] == 'rsync':
                sync_directories_using_rsync(i['url'], category)
            elif i['type'] == 'file':
                sync_directories_from_file(i['url'], category)
            elif i['type'] == 'directory':
                # honour the configured excludes for disk scans as well,
                # as the rsync based scans and nuke_gone_directories() do
                sync_directories_from_disk(i['path'], category, category.excludes)

                if options.delete_directories:
                    nuke_gone_directories(i['path'], category)
    finally:
        # never leave our pidfile behind, even when a sync raised
        remove_pidfile(pidfile)

    logger.info("Ending umdl")

    return 0

# Run the update when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())
