#!/usr/bin/python
__requires__ = 'TurboGears[future]'
import pkg_resources
pkg_resources.require("TurboGears")

import datetime as dt
import errno
import logging
from optparse import OptionParser
import os
import signal
from subprocess import Popen
import sys
import time
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning) 
    import turbogears

sys.path.append('/usr/share/mirrormanager/server')
from mirrormanager.model import Host
from mirrormanager.lib import manage_pidfile, remove_pidfile

# Path of the pidfile that main() passes to manage_pidfile()/remove_pidfile().
pidfile='/var/run/mirrormanager/crawler.pid'
# Parsed command-line options; assigned in main() and read by the rest of the module.
options = None

from turbogears.database import PackageHub
# Both names are rebound in main() to PackageHub("mirrormanager").
hub = __connection__ = None
# Module-wide logger; assigned in main() from setup_logger().
logger = None

def total_seconds(td):
    """Return the duration of timedelta *td* in whole seconds.

    Only the ``days`` and ``seconds`` fields are used; microseconds are
    ignored.  Serves as a fallback where ``timedelta.total_seconds()`` is
    unavailable.
    """
    seconds_per_day = 24 * 3600
    whole_days_in_seconds = td.days * seconds_per_day
    return whole_days_in_seconds + td.seconds

class ForkingMaster:
    """Launch per-host crawler subprocesses, at most ``max_children`` at once.

    Children are tracked in ``active_children`` (``Popen`` objects, in launch
    order).  Each child gets a ``kill_time`` attribute and an entry in
    ``timings`` (keyed by pid) when it is started; the entry is consumed when
    the child is reaped so the crawl duration can be stored on the Host.
    """

    def __init__(self, max_children = 10):
        """Create a master that runs up to *max_children* crawlers in parallel."""
        self.active_children = []
        self.max_children = max_children
        # Shared stdin/stdout for every child.
        self.devnull = open('/dev/null', 'r+')
        self.timings = {}

    def check_timedout_children(self):
        """SIGKILL every child whose ``kill_time`` has passed.

        ``active_children`` is in launch order and every child gets the same
        timeout added to its start time, so the scan stops at the first child
        that has not timed out yet.  Killed children stay in the list until
        ``collect_children`` reaps them.  Always returns None.
        """
        now = dt.datetime.utcnow()
        for child in self.active_children:
            if child.kill_time >= now:
                # items lower on this list are newer, no need to check
                break
            try:
                os.kill(child.pid, signal.SIGKILL)
            except OSError as e:
                # the process could already be gone
                logger.debug('Could not kill process %d: %s' % (child.pid, e))
            else:
                logger.info('Killed process %d' % child.pid)
        return None

    def collect_children(self):
        """Reap every child that has exited, without blocking.

        Returns False when the OS reports that this process has no children
        at all (ECHILD); any children still tracked at that point are
        dropped, since they can never be reaped and would otherwise keep the
        wait loops spinning forever.  Returns None once no further child is
        ready, after giving timed-out children a chance to be killed.
        """
        while True:
            try:
                pid, status = os.waitpid(-1, os.WNOHANG)
            except OSError as e:
                logger.debug(str(e))
                if e.errno == errno.ECHILD:
                    logger.debug("Got ECHILD.  Number of active children: %d/%d" % (len(self.active_children), self.max_children))
                    self._forget_stale_children()
                    return False
                pid = None

            if not pid:
                # no child was ready, see if any should be killed
                self.check_timedout_children()
                return None

            # a child should be reaped
            for p in self.active_children:
                if p.pid == pid:
                    self.stop_time(p)
                    logger.debug("Removing child pid %d, leaving %d active children" % (p.pid, len(self.active_children)-1))
                    self.active_children.remove(p)
                    # pids are unique among live children; stopping here also
                    # avoids iterating a list that was just mutated.
                    break

    def _forget_stale_children(self):
        """Drop tracked children that the OS no longer knows about."""
        for p in self.active_children:
            logger.warning('Child pid %d disappeared without being reaped' % p.pid)
            self.timings.pop(p.pid, None)
        del self.active_children[:]

    def process_request(self, command, args, host):
        """Start *command* with argv *args* as a crawler subprocess for *host*.

        The child's stdin and stdout are /dev/null; its stderr is appended to
        ``<options.logdir>/<host.id>-stderr.log``.
        """
        logger.info("Starting crawler %s: %s" % (host.name, args))
        stderr_path = os.path.join(options.logdir, "%d-stderr.log" % host.id)
        # The child keeps its own copy of the descriptor, so the parent's
        # handle is closed right after the spawn instead of leaking one open
        # file per crawled host.
        with open(stderr_path, 'a') as stderr:
            p = Popen(args, executable=command, stdin=self.devnull, stdout=self.devnull, stderr=stderr, close_fds=True)
        self.start_time(p, host.id)
        logger.debug("Adding child pid %d" % p.pid)
        self.active_children.append(p)
        logger.debug("Number of active children now: %d" % len(self.active_children))

    def wait_for_available_slot(self):
        """Block, polling once per second, until fewer than ``max_children`` run."""
        logger.debug("Waiting for a slot: Number of active children: %d/%d" % (len(self.active_children), self.max_children))
        while len(self.active_children) >= self.max_children:
            self.collect_children()
            time.sleep(1)

    def wait_for_completion(self):
        """Block, polling once per second, until every child has been reaped."""
        self.max_children = 0
        while len(self.active_children):
            self.collect_children()
            time.sleep(1)

    def start_time(self, p, hostid):
        """Record the start time of child *p* and set its kill deadline.

        The deadline is ``options.timeout_minutes`` from now and is stored on
        the ``Popen`` object itself as ``kill_time``.
        """
        now = dt.datetime.utcnow()
        p.kill_time = now + dt.timedelta(seconds=(options.timeout_minutes * 60))
        self.timings[p.pid] = dict()
        self.timings[p.pid]['start'] = now
        self.timings[p.pid]['hostid'] = hostid

    def stop_time(self, p):
        """Log how long child *p* ran and store it on its Host.

        The duration, in whole seconds, is written to
        ``host.last_crawl_duration``.  The timing entry is removed up front so
        it cannot linger if the host lookup fails.
        """
        timing = self.timings.pop(p.pid)
        timing['stop'] = dt.datetime.utcnow()

        diff = timing['stop'] - timing['start']
        host = Host.get(timing['hostid'])
        logger.info('Host %s (id=%s) crawl time %s' % (host.name, host.id, str(diff)))
        try:
            seconds = int(diff.total_seconds())
        except AttributeError: # python < 2.7
            seconds = total_seconds(diff)
        host.last_crawl_duration = seconds

def doit():
    """Crawl every eligible host, running up to ``options.threads`` crawlers.

    Hosts are visited in descending ``last_crawl_duration`` order.  Hosts
    flagged private are left out unless ``--include-private`` was given
    (the option used to be parsed but ignored).  A host is skipped when its
    id is outside ``[options.startid, options.stopid)``, when the host or its
    site is not both admin- and user-active, or when its site is private.
    Returns once every started crawler has been reaped.
    """
    master = ForkingMaster(max_children=options.threads)
    commonargs = [ options.crawler_perhost, '-c', options.config]
    if options.canary:
        commonargs.append('--canary')

    if options.include_private:
        query = Host.select()
    else:
        query = Host.selectBy(private=False)
    # Materialize the result up front; the count for progress messages then
    # comes from the same list instead of a second query.
    hosts = list(query.orderBy('-last_crawl_duration'))
    numhosts = len(hosts)

    for i, h in enumerate(hosts, 1):
        try:
            if h.id < options.startid: continue
            if h.id >= options.stopid: continue
            master.wait_for_available_slot()

            if not (h.admin_active and h.user_active and h.site.user_active and h.site.admin_active):
                continue
            if h.site.private:
                continue

            logfilename = os.path.join(options.logdir, str(h.id) + '.log')
            hostargs = ['--hostid', str(h.id), '--logfile', logfilename]
            args = commonargs + hostargs
            logger.debug('starting crawler for host %s (id=%d) %d/%d' % (h.name, h.id, i, numhosts))
            master.process_request(options.crawler_perhost, args, h)
        except AssertionError: # someone deleted our host while we were looking at it
            continue

    master.wait_for_completion()

def setup_logger(debug):
    """Create and return the 'crawler' logger, writing to ``options.logfile``.

    Records are formatted as "<timestamp> <message>" with timestamps in UTC.
    The level is DEBUG when *debug* is true, INFO otherwise.
    """
    # ``import logging`` alone does not load the handlers submodule; import it
    # explicitly rather than relying on another module having done so.
    import logging.handlers

    formatter = logging.Formatter(fmt="%(asctime)s %(message)s")
    formatter.converter = time.gmtime

    # WatchedFileHandler reopens the file if it is moved or removed underneath
    # us (e.g. by log rotation).
    handler = logging.handlers.WatchedFileHandler(options.logfile, "a+b")
    handler.setFormatter(formatter)

    logger = logging.getLogger('crawler')
    logger.addHandler(handler)
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    return logger

def main():
    """Parse the command line, set up TurboGears and logging, run the crawl.

    Returns 1 if the pidfile shows another instance running, 0 after a
    completed crawl.  Options are parsed before the pidfile is claimed, so
    ``--help`` or a bad option never touches it, and the pidfile is removed
    even when setup or the crawl raises.
    """
    global options
    global hub
    global __connection__
    global logger

    parser = OptionParser(usage=sys.argv[0] + " [options]")
    parser.add_option("-c", "--config",
                      dest="config", default='/etc/mirrormanager/prod.cfg',
                      help="TurboGears config file to use")

    parser.add_option("--include-private",
                      action="store_true", dest="include_private", default=False,
                      help="Include hosts marked 'private' in the crawl")

    parser.add_option("-t", "--threads", type="int",
                      dest="threads", default=10,
                      help="max threads to start in parallel")
    parser.add_option("-l", "--logdir", type="string", metavar="DIR",
                      dest="logdir", default='/var/log/mirrormanager/crawler',
                      help="write individual host logfiles to DIR")
    parser.add_option("--timeout-minutes", type="int",
                      dest="timeout_minutes", default=120,
                      help="per-host timeout, in minutes")
    parser.add_option("--logfile", type="string", metavar="FILE",
                      dest="logfile", default='/var/log/mirrormanager/crawler.log',
                      help="write logfile to FILE")
    parser.add_option("--startid", type="int", metavar="ID",
                      dest="startid", default=0,
                      help="Start crawling at host ID (default=0)")
    parser.add_option("--stopid", type="int", metavar="ID",
                      dest="stopid", default=sys.maxint,
                      help="Stop crawling before host ID (default=maxint)")
    parser.add_option("--crawler_perhost", type="string", metavar="FILE",
                      dest="crawler_perhost", default='/usr/share/mirrormanager/server/crawler_perhost',
                      help="Per-host crawler executable (default=/usr/share/mirrormanager/server/crawler_perhost)")

    parser.add_option("--canary",
                      dest="canary", action="store_true", default=False,
                      help="fast crawl by only scanning for canary files")

    parser.add_option("--debug", "-d",
                      dest="debug", action="store_true", default=False,
                      help="enable printing of debug-level messages")

    # parse_args() exits on --help or on a bad option, so it runs before the
    # pidfile is claimed.
    (options, args) = parser.parse_args()

    if manage_pidfile(pidfile):
        print("another instance is running, try again later.")
        return 1

    try:
        turbogears.update_config(configfile=options.config,
                                 modulename="mirrormanager.config")
        hub = PackageHub("mirrormanager")
        __connection__ = hub

        logger = setup_logger(options.debug)

        doit()
    finally:
        # Release the pidfile even if setup or the crawl failed.
        remove_pidfile(pidfile)
    return 0

# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
