From 3b3f86b71e71b4adc7a48486ee9881276a4ba27c Mon Sep 17 00:00:00 2001 From: chrisjbillington Date: Wed, 25 Mar 2020 12:31:16 -0400 Subject: [PATCH] Allow utf8 in mappings We were previously processing entries in mapping files (when `--mappings-are-raw` is not given) with `.decode('unicode_escape').encode('utf8')` to replace backslash escape sequences in bytestrings with the utf-8 encoded characters they represent. However, it turns out that `.decode('unicode_escape')` assumes latin-1 encoding if it encounters non-ascii bytes: https://bugs.python.org/issue21331. So this gave incorrect results if non-ascii utf8 data was present in the mapping. To fix this, we now add an extra layer of `.decode('utf8').encode('unicode-escape')` in order to convert any non-ascii characters into their backslash escape sequences. Then the subsequent `.decode('unicode_escape')` only encounters ascii characters and gives correct results. --- README.md | 8 ++++---- hg-fast-export.py | 14 +++++++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7cecb98..31610a8 100644 --- a/README.md +++ b/README.md @@ -80,10 +80,10 @@ author information than git, an author mapping file can be given to hg-fast-export to fix up malformed author strings. The file is specified using the -A option. The file should contain lines of the form `""=""`. Inside the key and value strings, all escape -sequences understood by the python `string_escape` encoding are -supported. (Versions of fast-export prior to v171002 had a different -syntax, the old syntax can be enabled by the flag -`--mappings-are-raw`.) +sequences understood by the python `unicode_escape` encoding are +supported; strings are otherwise assumed to be UTF8-encoded. +(Versions of fast-export prior to v171002 had a different syntax, the +old syntax can be enabled by the flag `--mappings-are-raw`.) The example authors.map below will translate `User ` to `User `. 
diff --git a/hg-fast-export.py b/hg-fast-export.py index 5fd7fb3..f5ac5bf 100755 --- a/hg-fast-export.py +++ b/hg-fast-export.py @@ -426,12 +426,20 @@ def load_mapping(name, filename, mapping_is_raw): return None return (m.group(1).strip(), m.group(2).strip()) + def process_unicode_escape_sequences(s): + # Replace unicode escape sequences in the otherwise UTF8-encoded bytestring s with + # the UTF8-encoded characters they represent. We need to do an additional + # .decode('utf8').encode('unicode-escape') to convert any non-ascii characters into + # their escape sequences so that the subsequent .decode('unicode-escape') succeeds: + return s.decode('utf8').encode('unicode-escape').decode('unicode-escape').encode('utf8') + def parse_quoted_line(line): m=quoted_regexp.match(line) if m==None: - return None - return (m.group(1).decode('unicode_escape').encode('utf8'), - m.group(5).decode('unicode_escape').encode('utf8')) + return + + return (process_unicode_escape_sequences(m.group(1)), + process_unicode_escape_sequences(m.group(5))) cache={} if not os.path.exists(filename): -- 2.11.4.GIT