# -*- coding: utf-8 -*-
# -------------------------------------------
# Copyright 2010, Matte Matik
#
# This file is released under GNU General Public License v3
# http://www.gnu.org/licenses/gpl-3.0.html
# -------------------------------------------

import sys
import urllib2
import re
import getopt
import traceback

#------------------------------------------------------------
# Default transition
#------------------------------------------------------------
def antisemittransition(html):
    """Rewrite html using the 'antisemit' word-substitution table.

    Builds an ordered list of [regex pattern, replacement] pairs and passes
    it, together with html, to transformation(). The patterns target Swedish
    gendered vocabulary, with the letters a-umlaut / o-umlaut written as the
    HTML entities &auml; / &ouml;; the replacements are ethnic or national
    terms.

    html -- the text to rewrite.

    Returns whatever transformation() returns for this table.
    """
    # Convert men to something different
    # The list order is significant: rules are listed from specific to more
    # general forms, and the loose prefix rules come last.
    converter = [
        [r'\ben man\b','en jude'],
        [r'\bmannen\b','juden'],
        [r'mannens\b','judens'],
        [r'\bmanliga\b','judiska'],
        [r'\bmanlige\b','judiske'],
        [r'\bman\.','jude.'],
        # Capitalized
        [r'\bEn man\b','En jude'],
        [r'\bMannen\b','Juden'],
        [r'\bManliga\b','Judiska'],

        # Plural forms; these patterns have no leading \b, so they also match
        # at the end of longer words.
        [r'm&auml;nnen\b','judarna'],
        [r'm&auml;nnens\b','judarnas'],
        [r'm&auml;n\b','judar'],
        [r'm&auml;ns\b','judars'],
        [r'M&auml;nnen\b','Judarna'],
        [r'M&auml;nnens\b','Judarnas'],
        [r'M&auml;n\b','Judar'],
        [r'M&auml;ns\b','Judars'],

        [r'\bkvinna\b','svensk'],
        [r'\bkvinnan\b','svensken'],
        [r'\bkvinnliga\b','svenska'],
        [r'kvinnorna\b','svenskarna'],
        [r'kvinnornas\b','svenskarnas'],
        [r'\bkvinnans\b','svenskens'],
        [r'\bkvinnor\b','svenskar'],
        [r'\bkvinnors\b','svenskars'],
        [r'\bkvinnlighet\b','svenskhet'],

        # NOTE(review): two of the capitalized rules below ('Kvinnornas',
        # 'Kvinnors') have lower-case replacements, unlike their neighbours -
        # confirm whether that is intended.
        [r'\bKvinna\b','Svensk'],
        [r'\bKvinnan\b','Svensken'],
        [r'\bKvinnliga\b','Svenska'],
        [r'\bKvinnorna\b','Svenskarna'],
        [r'\bKvinnornas\b','svenskarnas'],
        [r'\bKvinnans\b','Svenskens'],
        [r'\bKvinnor\b','Svenskar'],
        [r'\bKvinnors\b','svenskars'],
        [r'\bKvinnlighet\b','Svenskhet'],

        # Girls, boys
        [r'\ben skolflicka\b','ett svenskt barn'],
        [r'\ben flicka\b','ett svenskt barn'],
        [r'\bflickan\b','det svenska barnet'],

        [r'\bflickorna\b','de svenska barnen'],
        [r'\bflickornas\b','de svenska barnens'],
        [r'\bflickor\b','svenska barn'],
        [r'\bflickors\b','svenska barns'],

        [r'\ben tjej\b','ett svenskt barn'],
        [r'\btjejen\b','det svenska barnet'],

        [r'\btjejerna\b','de svenska barnen'],
        [r'\btjejernas\b','de svenska barnens'],
        [r'\btjejer\b','svenska barn'],
        [r'\btjejers\b','svenska barns'],

        [r'\ben pojke\b','ett judiskt barn'],
        [r'\bpojken\b','det judiska barnet'],

        [r'\bpojkarna\b','de judiska barnen'],
        [r'\bpojkarnas\b','de judiska barnens'],
        [r'\bpojkar\b','judiska barn'],
        [r'\bpojkars\b','judiska barns'],

        [r'\bEn flicka\b','Ett svenskt barn'],
        [r'\bFlickan\b','Det svenska barnet'],

        [r'\bFlickorna\b','De svenska barnen'],
        [r'\bFlickornas\b','De svenska barnens'],
        [r'\bFlickor\b','Svenska barn'],
        [r'\bFlickors\b','Svenska barns'],

        [r'\bEn tjej\b','Ett svenskt barn'],
        [r'\bTjejen\b','Det svenska barnet'],

        [r'\bTjejerna\b','De svenska barnen'],
        [r'\bTjejernas\b','De svenska barnens'],
        [r'\bTjejer\b','Svenska barn'],
        [r'\bTjejers\b','Svenska barns'],

        [r'\bEn pojke\b','Ett judiskt barn'],
        [r'\bPojken\b','Det judiska barnet'],

        [r'\bPojkarna\b','De judiska barnen'],
        [r'\bPojkarnas\b','De judiska barnens'],
        [r'\bPojkar\b','Judiska barn'],
        [r'\bPojkars\b','Judiska barns'],

        # Gender -> race
        [r'\bgenus','ras'],
        [r'\bk&ouml;n\b','ras'],
        [r'\bk&ouml;nen\b','raserna'],
        [r'\bk&ouml;ns','ras'],

        # NOTE(review): the 'K&ouml;n' and 'K&ouml;ns' rules below produce a
        # lower-case 'ras' - confirm.
        [r'\bGenus','Ras'],
        [r'\bK&ouml;n\b','ras'],
        [r'\bK&ouml;nen\b','Raserna'],
        [r'\bK&ouml;ns','ras'],

        # Dubious:
        # Several of these are prefix-only patterns (no trailing \b), so they
        # rewrite the start of any longer word as well.
        [r'\bkvinno','svensk-'],
        [r'\bmans','jude'],
        [r'\bj&auml;mst&auml;llde\b','integrerade'],
        [r'\bj&auml;mst&auml;lldhet\b','rasbalans'],
        [r'\bj&auml;mst&auml;lldhets','rasbalans'],
        [r'\bmanlighet','judiskhet'],
        [r'\bfeminist','rasbiolog'],
        [r'\bfeminisit','rasbiolog'],
        [r'\bgubbe\b','jude'],
        [r'\bgubb','jude'],
        [r'\blesbiska\b','svenskälskande'],
        [r'\blesbisk\b','svenskälskare'],

        # NOTE(review): 'Kvinno' and 'Mans' below map to lower-case text, and
        # 'Gubb' maps to 'Invandrar' although the lower-case rule above maps
        # to 'jude' - possibly copied from invandrartransition; confirm.
        [r'\bKvinno','svensk-'],
        [r'\bMans','jude'],
        [r'\bJ&auml;mst&auml;lldhet','Rasbalans'],
        [r'\bJ&auml;mst&auml;lldhets','Rasbalans'],
        [r'\bManlighet','Judiskhet'],
        [r'\bFeminist','Rasbiolog'],
        [r'\bFeminisit','Rasbiolog'],
        [r'\bGubbe\b','Jude'],
        [r'\bGubb','Invandrar'],
        [r'\bLesbiska\b','Svenskälskande'],
        [r'\bLesbisk\b','Svenskälskare'],


        [r'\bdamlag','Sverigelag']

    ]
    return transformation(html, converter)

#------------------------------------------------------------
def invandrartransition(html):
    """Rewrite html using the 'invandrare' word-substitution table.

    Builds an ordered list of [regex pattern, replacement] pairs and passes
    it, together with html, to transformation(). The patterns target Swedish
    gendered vocabulary, with the letters a-umlaut / o-umlaut written as the
    HTML entities &auml; / &ouml;; the replacements are ethnic or national
    terms.

    html -- the text to rewrite.

    Returns whatever transformation() returns for this table.
    """
    # Convert men to something different
    # The list order is significant: each rule operates on text that earlier
    # rules may already have rewritten.
    converter = [
        [r'\ben man\b','en invandrare'],
        [r'\bmannen\b','invandraren'],
        [r'mannens\b','invandrarens'],
        [r'\bmanliga\b','invandrarspecifika'],
        [r'\bmanlige\b','invandrarspecifike'],
        # Capitalized
        # NOTE(review): 'Manliga' maps to 'Invandraraktiga' while the
        # lower-case rule uses 'invandrarspecifika' - confirm.
        [r'\bEn man\b','En invandrare'],
        [r'\bMannen\b','Invandraren'],
        [r'\bManliga\b','Invandraraktiga'],

        # Plural forms; these patterns have no leading \b, so they also match
        # at the end of longer words.
        [r'm&auml;nnen\b','invandrarna'],
        [r'm&auml;nnens\b','invandrarnas'],
        [r'm&auml;n\b','invandrare'],
        [r'm&auml;ns\b','invandrares'],

        [r'M&auml;nnen\b','Invandrarna'],
        [r'M&auml;nnens\b','Invandrarnas'],
        [r'M&auml;n\b','Invandrare'],
        [r'M&auml;ns\b','Invandrares'],

        [r'\bkvinna\b','svensk'],
        [r'\bkvinnan\b','svensken'],
        [r'\bkvinnliga\b','svenska'],
        [r'kvinnorna\b','svenskarna'],
        [r'kvinnornas\b','svenskarnas'],
        [r'\bkvinnans\b','svenskens'],
        [r'\bkvinnor\b','svenskar'],
        [r'\bkvinnors\b','svenskars'],
        [r'\bkvinnlighet\b','svenskhet'],

        # NOTE(review): two of the capitalized rules below ('Kvinnornas',
        # 'Kvinnors') have lower-case replacements, unlike their neighbours -
        # confirm whether that is intended.
        [r'\bKvinna\b','Svensk'],
        [r'\bKvinnan\b','Svensken'],
        [r'\bKvinnliga\b','Svenska'],
        [r'\bKvinnorna\b','Svenskarna'],
        [r'\bKvinnornas\b','svenskarnas'],
        [r'\bKvinnans\b','Svenskens'],
        [r'\bKvinnor\b','Svenskar'],
        [r'\bKvinnors\b','svenskars'],
        [r'\bKvinnlighet\b','Svenskhet'],

        # Girls, boys
        [r'\ben skolflicka\b','ett svenskt barn'],
        [r'\ben flicka\b','ett svenskt barn'],
        [r'\bflickan\b','det svenska barnet'],

        [r'\bflickorna\b','de svenska barnen'],
        [r'\bflickornas\b','de svenska barnens'],
        [r'\bflickor\b','svenska barn'],
        [r'\bflickors\b','svenska barns'],

        [r'\ben tjej\b','ett svenskt barn'],
        [r'\btjejen\b','det svenska barnet'],

        [r'\btjejerna\b','de svenska barnen'],
        [r'\btjejernas\b','de svenska barnens'],
        [r'\btjejer\b','svenska barn'],
        [r'\btjejers\b','svenska barns'],

        [r'\ben pojke\b','ett invandrarbarn'],
        [r'\bpojken\b','invandrarbarnet'],
        [r'\bpojkarna\b','invandrarbarnen'],
        [r'\bpojkarnas\b','invandrarbarnens'],
        [r'\bpojkar\b','invandrarbarn'],
        [r'\bpojkars\b','invandrarbarns'],

        [r'\bEn flicka\b','Ett svenskt barn'],
        [r'\bFlickan\b','Det svenska barnet'],

        [r'\bFlickorna\b','De svenska barnen'],
        [r'\bFlickornas\b','De svenska barnens'],
        [r'\bFlickor\b','Svenska barn'],
        [r'\bFlickors\b','Svenska barns'],

        [r'\bEn tjej\b','Ett svenskt barn'],
        [r'\bTjejen\b','Det svenska barnet'],

        [r'\bTjejerna\b','De svenska barnen'],
        [r'\bTjejernas\b','De svenska barnens'],
        [r'\bTjejer\b','Svenska barn'],
        [r'\bTjejers\b','Svenska barns'],

        [r'\bEn pojke\b','Ett invandrarbarn'],
        [r'\bPojken\b','Invandrarbarnet'],
        [r'\bPojkarna\b','Invandrarbarnen'],
        [r'\bPojkarnas\b','Invandrarbarnens'],
        [r'\bPojkar\b','Invandrarbarn'],
        [r'\bPojkars\b','Invandrarbarns'],


        # Gender -> race
        [r'\bgenus','invandrings'],
        [r'\bk&ouml;n\b','folkgrupp'],
        [r'\bk&ouml;nen\b','folkgrupperna'],
        [r'\bk&ouml;ns','folkgrupps'],

        # NOTE(review): the capitalized rules below use 'Ras'/'ras'/'Raserna'
        # while the lower-case rules above use 'invandrings'/'folkgrupp...' -
        # looks copied from antisemittransition; confirm.
        [r'\bGenus','Ras'],
        [r'\bK&ouml;n\b','ras'],
        [r'\bK&ouml;nen\b','Raserna'],
        [r'\bK&ouml;ns','ras'],

        # Dubious:
        # Several of these are prefix-only patterns (no trailing \b), so they
        # rewrite the start of any longer word as well.
        # NOTE(review): '\bgubb' is listed before '\bgubbe\b', so by the time
        # the second rule runs every 'gubbe' has already been rewritten and
        # that rule can no longer match. The same applies to the capitalized
        # pair further down.
        [r'\bkvinno','svensk-'],
        [r'\bmans','invandrar'],
        [r'\bj&auml;mst&auml;llde\b','integrerade'],
        [r'\bj&auml;mst&auml;lldhet\b','rasbalans'],
        [r'\bj&auml;mst&auml;lldhets','rasbalans'],
        [r'\bmanlighet','invandrar'],
        [r'\bfeminist','rasbiolog'],
        [r'\bfeminisit','rasbiolog'],
        [r'\bgubb','invandrar'],
        [r'\bgubbe\b','jude'],
        [r'\blesbiska\b','svenskälskande'],
        [r'\blesbisk\b','svenskälskare'],

        [r'\bKvinno','Svensk-'],
        [r'\bMans','Invandrar'],
        [r'\bJ&auml;mst&auml;lldhet\b','Rasbalans'],
        [r'\bJ&auml;mst&auml;lldhets','Rasbalans'],
        [r'\bManlighet','Invandrar'],
        [r'\bFeminist','Rasbiolog'],
        [r'\bFeminisit','Rasbiolog'],
        [r'\bGubb','Invandrar'],
        [r'\bGubbe\b','Jude'],
        [r'\bLesbiska\b','Svenskälskande'],
        [r'\bLesbisk\b','Svenskälskare'],

        [r'\bdamlag','Sverigelag']

    ]
    return transformation(html, converter)

#------------------------------------------------------------
def transformation(html, converter):
    """Apply an ordered list of regex substitutions to html.

    html      -- the text to rewrite.
    converter -- sequence of rules; element 0 of each rule is a regular
                 expression and element 1 is its replacement. Rules are
                 applied in order, each one to the output of the previous
                 one, so earlier rules can affect what later rules see.

    Returns the rewritten text (html unchanged when no rule matches or the
    rule list is empty).
    """
    for rule in converter:
        pattern, replacement = rule[0], rule[1]
        # Deliberately no re.VERBOSE: in verbose mode the regex engine
        # ignores literal whitespace inside the pattern, so a multi-word
        # pattern would silently be compiled without its space and could
        # never match the two-word phrase it was written for.
        html = re.sub(pattern, replacement, html)
    return html

#------------------------------------------------------------
def printusage():
    """Print the one-line command-line synopsis to standard output."""
    usage = "transgenus.py [-u <url> | -f filename -r siteroot] -o outfile [-c conversiontype] [-d decode (f.ex. iso-8859-1) ]"
    print(usage)
#------------------------------------------------------------
# Matches a leading URI scheme such as "http:", "https:", "mailto:" or
# "javascript:".
_URL_SCHEME = re.compile(r"[a-zA-Z][a-zA-Z0-9+.-]*:")


def _absolute_url(link, domain, siteroot):
    """Return link rewritten so that it resolves against the original site.

    Links that already carry a scheme, protocol-relative links ("//host/...")
    and in-page anchors ("#...") are returned untouched. Host-relative links
    ("/path") are prefixed with domain, everything else with siteroot + "/".
    """
    if link[0:2] == "//" or link[0:1] == "#" or _URL_SCHEME.match(link):
        return link
    if link[0:1] == "/":
        return domain + link
    return siteroot + "/" + link


def main(url, tmpfilename, siteroot, outfile, conversiontype, decode="utf-8"):
    """Fetch or read a page, make its links absolute, convert it and emit it.

    url            -- page to download; when given, siteroot is derived from
                      it and the siteroot argument is ignored.
    tmpfilename    -- local file to read when url is None.
    siteroot       -- base URL used to resolve relative links (required when
                      reading from a file).
    outfile        -- file to write the result to; None prints to stdout.
    conversiontype -- 'antisemit' or 'invandrare'; selects the table.
    decode         -- encoding of the input; anything other than "utf-8" is
                      transcoded to UTF-8 first.

    Returns None. Exits with status 1 when no input or no site root is
    available; prints a message and returns when conversiontype is unknown.
    """
    if url is not None:
        # The site root is the URL minus its last path segment. Only cut at
        # the last "/" when the URL actually has a path, otherwise the cut
        # would land inside the "://" of the scheme.
        scheme_end = url.find("://")
        if scheme_end == -1:
            path_start = url.find("/")
        else:
            path_start = url.find("/", scheme_end + 3)
        if path_start == -1:
            siteroot = url
        else:
            siteroot = url[0:url.rfind("/")]
        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            response.close()
    elif tmpfilename is not None:
        print("Opening " + str(tmpfilename) + ", site root: " + str(siteroot))
        with open(tmpfilename) as source:
            html = source.read()
    else:
        print("No input specified!")
        printusage()
        sys.exit(1)

    if siteroot is None:
        print("No site root specified!")
        printusage()
        sys.exit(1)

    # Scheme + host part of the site root, used for host-relative links.
    domain_match = re.match("https?://[^/]*", siteroot)
    if domain_match is not None:
        domain = domain_match.group(0)
    else:
        domain = siteroot

    # Swedish chars: normalise to UTF-8, then to HTML entities, which is the
    # form the conversion tables are written against.
    if decode != "utf-8":
        html = html.decode(decode).encode("utf-8")
    for char, entity in [
        ['å', '&aring;'],
        ['ä', '&auml;'],
        ['ö', '&ouml;'],
        ['Å', '&Aring;'],
        ['Ä', '&Auml;'],
        ['Ö', '&Ouml;'],
    ]:
        html = html.replace(char, entity)

    # Convert links: rewrite the href/src value of every matching tag so the
    # saved page still loads its resources from the original site.
    regexp = re.compile("<(a|link|img|script|iframe)\\b([^>]*?)(href|src)(=[\"|\']?)(.*?)([\"|\'|\\s])(.*?)>", re.MULTILINE)

    def rewrite_tag(match):
        tag, before, attr, opener, link, closer, rest = match.groups()
        newurl = _absolute_url(link, domain, siteroot)
        return "<" + tag + before + attr + opener + newurl + closer + rest + ">"

    html = regexp.sub(rewrite_tag, html)

    # Make the actual conversion
    if conversiontype == 'antisemit':
        result = antisemittransition(html)
    elif conversiontype == 'invandrare':
        result = invandrartransition(html)
    else:
        print("Unknown conversion type: " + str(conversiontype))
        return None

    if outfile is not None:
        with open(outfile, 'w') as out:
            out.write(result)
    else:
        print(result)



# Command-line entry point: parse the options and hand them to main().
if __name__ == '__main__':
    if len(sys.argv) < 3:
        printusage()
        sys.exit(1)

    try:
        opts, args = getopt.getopt(sys.argv[1:],"f:r:o:u:c:d:")
    except getopt.GetoptError as err:
        # Unknown option or missing option argument: show the reason and the
        # usage line instead of a traceback.
        print(str(err))
        printusage()
        sys.exit(1)

    url = None
    filename = None
    siteroot = None
    outfile = None
    conversiontype = 'antisemit'
    decode = 'utf-8'

    for o,a in opts:
        if o == "-u":
            url = a
        elif o == "-f":
            filename = a
        elif o == "-r":
            siteroot = a
        elif o == "-o":
            outfile = a
        elif o == "-c":
            conversiontype = a
        elif o == "-d":
            decode = a

    main(url, filename, siteroot, outfile, conversiontype, decode)

