rt lots of misc fixes

* Had to reconsider some unicode compromises
* Add bz reference from RT
* Try to sanely represent literal and quoted text
* Add bugzilla reference with links
* Handle comments missing metadata
* Handle attachments with weird names / formats
author: cpettet <rush@wikimedia.org> 2014-12-12 00:50:09 -0600
committer: cpettet <rush@wikimedia.org> 2014-12-12 00:51:58 -0600
commit: 51d54f4a314f978aaf713a91e0df764aa5d58f90 (patch)
tree: cebe8fcd6cdd16fbad67bc74b5a89897b37c73fc /rt_create.py
parent: 9a30ffdaec03830966f408a4057da7ed276c6343 (diff)
1 files changed, 150 insertions, 51 deletions
diff --git a/rt_create.py b/rt_create.py
index e3c5447..8a202cd 100755
--- a/rt_create.py
+++ b/rt_create.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+#from __future__ import unicode_literals
 import time
 import json
 import os
@@ -52,8 +53,8 @@ def create(rtid):
     if current:
         import_priority, rtinfo, com, created, modified = current[0]
     else:
-        elog('%s not present for migration' % (rtid,))
-        return False
+        log('%s not present for migration' % (rtid,))
+        return 'missing'
 
     if not rtinfo:
         log("ignoring invalid data for issue %s" % (rtid,))
@@ -66,11 +67,49 @@ def create(rtid):
 
     if get_ref(rtid):
         log('reference ticket %s already exists' % (rtid,))
-        #return True
+        return True
 
     def remove_sig(content):
         return re.split('--\s?\n', content)[0]
 
+    def uob(obj, encoding='utf-8'):
+        """ unicode or bust"""
+        if isinstance(obj, basestring):
+            if not isinstance(obj, unicode):
+                obj = unicode(obj, encoding)
+        return obj
+
+    def sanitize_text(line):
+        if line.strip() and not line.lstrip().startswith('>'):
+            # in remarkup having '--' on a new line seems to bold last
+            # line so signatures really cause issues
+            if all(map(lambda c: c in '-', line.strip())):
+                return '%%%{0}%%%'.format(line.strip())
+            elif line.strip() == '-------- Original Message --------':
+                return '%%%{0}%%%'.format(line.strip())
+            elif line.strip() == '---------- Forwarded message ----------':
+                return '%%%{0}%%%'.format(unicode(line.strip()))
+            elif line.strip().startswith('#'):
+                return uob('%%%') + uob(line.strip()) + uob('%%%')
+            else:
+                return uob(line).strip()
+        elif line.strip().startswith('>'):
+            quoted_content = line.lstrip('>').strip()
+            if not quoted_content.lstrip('>').strip():
+                return line.strip()
+            if all(map(lambda c: c in '-', quoted_content.lstrip('>').strip())):
+                return "> ~~"
+            else:
+                return uob(line.strip())
+        else:
+            vlog("ignoring content line %s" % (line,))
+            return None
+
+    viewpolicy = phabdb.get_project_phid('WMF-NDA')
+    if not viewpolicy:
+        elog("View policy group not present: %s" % (viewpolicy,))
+        return False
+
     # Example:
     # id: ticket/8175/attachments\n
     # Attachments: 141490: (Unnamed) (multipart/mixed / 0b),
@@ -82,7 +121,6 @@ def create(rtid):
 
     history = response.get(path="ticket/%s/history?format=l" % (rtid,))
 
-
     rtinfo = json.loads(rtinfo)
     comments = json.loads(com)
     vlog(rtid)
@@ -99,7 +137,7 @@ def create(rtid):
             body, attached = attachsplit[0], attachsplit[1]
         else:
             body, attached = c, '0'
-        comment_dict[i]['text_body'] = body
+        comment_dict[i]['text_body'] = unicode(body)
         comment_dict[i]['attached'] = attached
 
     # Example:
@@ -184,30 +222,56 @@ def create(rtid):
         #    Private-l mailing list
         #    Private-l@lists.wikimedia.org
         #    https://lists.wikimedia.org/mailman/listinfo/private-l
+        if extract:
+            fdetails = extract.groups()
         if not extract and v.startswith('Attached Message Part'):
             continue
-        elif not extract:
-           raise Exception("no attachment extraction: %s %s (%s)" % (k, v, rtid))
-           continue
-        else:
-           vlog(extract.groups())
-           ainfo_ext[k] = extract.groups()
+        if not extract:
+            extract = re.match('(\S+)\s\((.*)\/(.*)\),.*', v)
+            if not extract:
+                elog("attachment CORRUPT or FAILED extraction: %s %s (%s)" % (k, v, rtid))
+                continue
+
+            fdetails = extract.group(1), '', extract.group(2), extract.group(3)
+
+        if not fdetails:
+            elog("attachment CORRUPT or FAILED extraction: %s %s (%s)" % (k, v, rtid))
+            continue
+        ainfo_ext[k] = fdetails
+        vlog(ainfo_ext[k])
 
+    # deb
+    # cgi
     attachment_types = ['pdf',
                         'jpeg',
+                        'asc',
                         'tgz',
+                        'csr',
                         'jpg',
                         'png',
                         'xls',
-                        'xlsx',
+                        'xls',
+                        'csv',
+                        'docx',
                         'gif',
                         'html',
                         'htm',
                         'txt',
+                        'diff',
                         'log',
                         'zip',
                         'rtf',
+                        'tmpl',
                         'vcf',
+                        'pub',
+                        'sql',
+                        'odt',
+                        'p7s',
+                        'iso',
+                        'ods',
+                        'conf',
+                        'doc',
+                        'xff',
                         'eml']
 
     #Uploading attachment
@@ -216,11 +280,13 @@ def create(rtid):
     uploaded = {}
     for k, v in ainfo_ext.iteritems():
         file_extension = v[1].lower()
+
         # vendors have this weird habit of capitalizing extension names
         # make sure we can handle the extension type otherwise
-        if file_extension not in attachment_types:
-            log("%s %s %s" % (rtid, v, file_extension))
-            raise Exception('unknown extension: %s (%s)' % (v, rtid))
+        #if file_extension not in attachment_types:
+        #    elog("Unknown Exception (%s) %s %s" % (rtid, v, file_extension))
+        #    #raise Exception('unknown extension: %s (%s)' % (v, rtid))
+
         full = "ticket/%s/attachments/%s/content" % (rtid, k)
         vcontent = response.get(path=full, headers={'Content-Type': v[2], 'Content-Length': v[3] })
         #PDF's don't react well to stripping header -- fine without it
@@ -230,12 +296,14 @@ def create(rtid):
             vcontent = vcontent.readlines()
             sanscontent = ''.join(vcontent[2:])
 
-        #{u'mimeType': u'image/jpeg', u'authorPHID': u'PHID-USER-bn2kbod4i7geycrbicns', 
-        #u'phid': u'PHID-FILE-ioj2mrujudkrekhl5pkl', u'name': u'0jp9B09.jpg',
-        #u'objectName': u'F25786', u'byteSize': u'120305',
-        #u'uri': u'http://fabapitest.wmflabs.org/file/data/t7j2qp7l5z4ou5qpbx2u/PHID-FILE-ioj2mrujudkrekhl5pkl/0jp9B09.jpg',
-        #u'dateCreated': u'1409345752', u'dateModified': u'1409345752', u'id': u'25786'}
-        upload = phabm.upload_file("%s.%s" % (v[0], file_extension), sanscontent)
+        if file_extension:
+            fname = "%s.%s" % (v[0], file_extension)
+        else:
+            fname = v[0]
+
+        upload = phabm.upload_file(fname,
+                                   sanscontent,
+                                   viewpolicy)
         uploaded[k] = upload
 
     if rtinfo['Queue'] not in rtlib.enabled:
@@ -243,6 +311,13 @@ def create(rtid):
         return True
 
     ptags = []
+
+    # In a practical sense ops-requests seemed to get tagged
+    # with straight Operations group in Phab so we backfill
+    # this for consistency.
+    if rtinfo['Queue'] == 'ops-requests':
+        ptags.append('operations')
+
     pname = rtlib.project_translate(rtinfo['Queue'])
     ptags.append(pname)
 
@@ -267,8 +342,13 @@ def create(rtid):
     # much like bugzilla comment 0 is the task description
     header = comment_dict[comment_dict.keys()[0]]
     del comment_dict[comment_dict.keys()[0]]
-    dtext = '\n'.join([l.strip() for l in header['body']['content'][0].splitlines()])
-    dtext = rtlib.shadow_emails(dtext)
+
+    dtext_san = []
+    dtext_list = header['body']['content'][0].splitlines()
+    for t in dtext_list:
+        dtext_san.append(sanitize_text(rtlib.shadow_emails(t)))
+    dtext = '\n'.join(filter(None, dtext_san))
+    #dtext = '\n'.join(filter(None, sanitize_text(rtlib.shadow_emails(dtext_list))))
     full_description = "**Author:** `%s`\n\n**Description:**\n%s\n" % (rtinfo['Creator'].strip(),
                                                                        dtext)
 
@@ -278,9 +358,18 @@ def create(rtid):
     for at in hafound:
         if at in upfiles:
             header_attachments.append('{F%s}' % uploaded[at]['id'])
-    if header_attachments:
+    if 'CF.{Bugzilla ticket}' in rtinfo or header_attachments: 
         full_description += '\n__________________________\n\n'
-        full_description += '\n'.join(header_attachments)
+        if 'CF.{Bugzilla ticket}' in rtinfo and rtinfo['CF.{Bugzilla ticket}']:
+            obzurl = 'https://old-bugzilla.wikimedia.org/show_bug.cgi?id='
+            obz = "[[ %s%s | %s ]]" % (obzurl,
+                                       rtinfo['CF.{Bugzilla ticket}'],
+                                       rtinfo['CF.{Bugzilla ticket}'],)
+            bzref = int(rtinfo['CF.{Bugzilla ticket}'].strip())
+            newbzref = bzref + 2000
+            full_description += "Bugzilla Ticket: %s => %s\n" % (obz, '{T%s}' % (newbzref,))
+        if header_attachments:
+            full_description += '\n'.join(header_attachments)
 
     vlog("Ticket Info: %s" % (full_description,))
     ticket =  phab.maniphest.createtask(title=rtinfo['Subject'],
@@ -288,8 +377,8 @@ def create(rtid):
                                         projectPHIDs=phids,
                                         ccPHIDs=[],
                                         priority=rtinfo['xpriority'],
-                                        auxiliary={"std:maniphest:external_reference":"rt%s" % (rtid,),
-                                                   "std:maniphest:security_topic":"%s" % ('none')})
+                                        auxiliary={"std:maniphest:external_reference":"rt%s" % (rtid,)})
+
     # XXX: perms
     botphid = phabdb.get_phid_by_username(config.phab_user)
     phabdb.set_task_title_transaction(ticket['phid'],
@@ -298,13 +387,13 @@ def create(rtid):
                                       'public')
 
     phabdb.set_task_ctime(ticket['phid'], rtlib.str_to_epoch(rtinfo['Created']))
+    phabdb.set_task_policy(ticket['phid'], viewpolicy)
 
-    vlog(str(ordered_comments))
+    #vlog(str(ordered_comments))
     fmt_comments = {}
     for comment, contents in comment_dict.iteritems():
         fmt_comment = {}
         dbody = contents['body']
-        print dbody
         if dbody['content'] is None and dbody['creator'] is None:
             continue
         elif dbody['content'] is None:
@@ -313,20 +402,15 @@ def create(rtid):
             mailsan = rtlib.shadow_emails(dbody['content'][0])
             content_literal = []
             for c in mailsan.splitlines():
-                if c.strip() and not c.lstrip().startswith('>'):
-                    # in remarkup having '--' on a new line seems to bold last
-                    # line so signatures really cause issues
-                    if c.strip() == '--':
-                        content_literal.append('%%%{0}%%%'.format(c.strip()))
-                    else:
-                        content_literal.append('{0}'.format(c.strip()))
-                elif c.strip():
-                    content_literal.append(c.strip())
-                else:
-                    vlog("ignoring content line %s" % (c,))
-            content = '\n'.join(content_literal)
-
-        if 'This transaction appears to have no content' in content:
+                content_literal.append(sanitize_text(c))
+            content = '\n'.join(filter(None, content_literal))
+
+            # In case of attachment but not much else
+            if not content and dbody['attached']:
+                content = True
+
+        void_content = 'This transaction appears to have no content'
+        if not content == True and void_content in content:
             content = None
 
         auto_actions = ['Outgoing email about a comment recorded by RT_System',
@@ -339,8 +423,13 @@ def create(rtid):
         preamble = ''
         cbody = ''
         if content:
+            if dbody['creator'] is None:
+                dbody['creator'] = '//creator field not set in source//'
             preamble += "`%s  wrote:`\n\n" % (dbody['creator'].strip(),)
-            cbody += "%s" % (content.strip() or 'no content',)
+
+            if content == True:
+                content = ''
+            cbody += "%s" % (content.strip() or '//no content//',)
 
         
         if dbody['nvalue'] or dbody['ovalue']:
@@ -376,7 +465,16 @@ def create(rtid):
             fmt_comment['xattached'] = cbody_attachments
         phabm.task_comment(ticket['id'], preamble + cbody)
         ctransaction = phabdb.last_comment(ticket['phid'])
-        created = rtlib.str_to_epoch_comments(dbody['created'])
+
+        try:    
+            created = rtlib.str_to_epoch_comments(dbody['created'])
+        except (ValueError, TypeError):
+            # A handful of issues seem to show NULL creation times
+            # for no reason: see 1953 for an example of NULL and
+            # 3001 for an example of None
+            elog("Could not determine comment time for %s" % (rtid,))
+            dbody['created'] = rtlib.str_to_epoch(rtinfo['Created'])
+
         phabdb.set_comment_time(ctransaction,
                                 created)
         fmt_comment['xctransaction'] = ctransaction
@@ -442,17 +540,18 @@ def main():
     pmig.close()
 
     #Serious business
-    if 'failed' in sys.argv:
+    if 'failed' in sys.argv or '-r' in sys.argv:
         for b in bugs:
-            notice("Removing rtid %s" % (b,))
-            log(util.remove_issue_by_bugid(b, bzlib.prepend))
+            util.notice("Removing rtid %s" % (b,))
+            log(util.remove_issue_by_bugid(b, rtlib.prepend))
 
     from multiprocessing import Pool
     pool = Pool(processes=int(config.bz_createmulti))
     _ =  pool.map(run_create, bugs)
-    complete = len(filter(bool, _))
-    failed = len(_) - complete
-    print '%s completed %s, failed %s' % (sys.argv[0], complete, failed)
+    missing = len([i for i in _ if i == 'missing'])
+    complete = len(filter(bool, [i for i in _ if i not in ['missing']]))
+    failed = (len(_) - missing) - complete
+    print '%s completed %s, missing %s, failed %s' % (sys.argv[0], complete, missing, failed)
 
 if __name__ == '__main__':
     main()
author	cpettet <rush@wikimedia.org>	2014-12-12 00:50:09 -0600
committer	cpettet <rush@wikimedia.org>	2014-12-12 00:51:58 -0600
commit	51d54f4a314f978aaf713a91e0df764aa5d58f90 (patch)
tree	cebe8fcd6cdd16fbad67bc74b5a89897b37c73fc /rt_create.py
parent	9a30ffdaec03830966f408a4057da7ed276c6343 (diff)