From 22f7803029ebf6a85d23806fc042742f90872043 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sat, 16 Nov 2024 18:55:46 +0100 Subject: [PATCH 01/14] Support multi-argument gettext functions --- Lib/test/test_tools/i18n_data/messages.pot | 49 +++-- Lib/test/test_tools/i18n_data/messages.py | 40 +++- Lib/test/test_tools/test_i18n.py | 4 +- Tools/i18n/pygettext.py | 202 ++++++++++++++------- 4 files changed, 213 insertions(+), 82 deletions(-) diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot index ddfbd18349ef4f..00b49ebb6d17d9 100644 --- a/Lib/test/test_tools/i18n_data/messages.pot +++ b/Lib/test/test_tools/i18n_data/messages.pot @@ -15,53 +15,78 @@ msgstr "" "Generated-By: pygettext.py 1.5\n" -#: messages.py:5 +#: messages.py:16 msgid "" msgstr "" -#: messages.py:8 messages.py:9 +#: messages.py:19 messages.py:20 msgid "parentheses" msgstr "" -#: messages.py:12 +#: messages.py:23 msgid "Hello, world!" msgstr "" -#: messages.py:15 +#: messages.py:26 msgid "" "Hello,\n" " multiline!\n" msgstr "" -#: messages.py:29 +#: messages.py:37 +msgid "kwargs work!" +msgstr "" + +#: messages.py:38 messages.py:81 messages.py:82 messages.py:85 messages.py:86 +msgid "foo" +msgid_plural "foos" +msgstr[0] "" +msgstr[1] "" + +#: messages.py:39 +msgid "something" +msgstr "" + +#: messages.py:42 msgid "Hello, {}!" msgstr "" -#: messages.py:33 +#: messages.py:43 +msgid "Hello, {}!world" +msgstr "" + +#: messages.py:46 msgid "1" msgstr "" -#: messages.py:33 +#: messages.py:46 msgid "2" msgstr "" -#: messages.py:34 messages.py:35 +#: messages.py:47 messages.py:48 msgid "A" msgstr "" -#: messages.py:34 messages.py:35 +#: messages.py:47 messages.py:48 msgid "B" msgstr "" -#: messages.py:36 +#: messages.py:49 msgid "set" msgstr "" -#: messages.py:42 +#: messages.py:55 msgid "nested string" msgstr "" -#: messages.py:47 +#: messages.py:60 msgid "baz" msgstr "" +#: messages.py:83 messages.py:84 messages.py:87 messages.py:88 +msgctxt "context" +msgid "foo" +msgid_plural "foos" +msgstr[0] "" +msgstr[1] "" + diff --git a/Lib/test/test_tools/i18n_data/messages.py b/Lib/test/test_tools/i18n_data/messages.py index f220294b8d5c67..02643fed57ed2e 100644 --- a/Lib/test/test_tools/i18n_data/messages.py +++ b/Lib/test/test_tools/i18n_data/messages.py @@ -1,5 +1,16 @@ # Test message extraction -from gettext import gettext as _ +from gettext import ( + gettext, + ngettext, + pgettext, + npgettext, + dgettext, + dngettext, + dpgettext, + dnpgettext +) + +_ = gettext # Empty string _("") @@ -21,13 +32,15 @@ _(None) _(1) _(False) -_(x="kwargs are not allowed") + +# Unusual, but valid arguments +_(x="kwargs work!") _("foo", "bar") _("something", x="something else") # .format() _("Hello, {}!").format("world") # valid -_("Hello, {}!".format("world")) # invalid +_("Hello, {}!".format("world")) # should be invalid, but is extracted (also by xgettext and pybabel) # Nested structures _("1"), _("2") @@ -62,3 +75,24 @@ def _(x): def _(x="don't extract me"): pass + + +# Other gettext functions +gettext("foo") +ngettext("foo", "foos", 1) +pgettext("context", "foo") +npgettext("context", "foo", "foos", 1) +dgettext("domain", "foo") +dngettext("domain", "foo", "foos", 1) +dpgettext("domain", "context", "foo") +dnpgettext("domain", "context", "foo", "foos", 1) + +# Invalid calls which are not extracted +gettext() +ngettext('foo') +pgettext('context') +npgettext('context', 'foo') +dgettext('domain') +dngettext('domain', 'foo') +dpgettext('domain', 'context') +dnpgettext('domain', 'context', 'foo') diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py index 6f71f0976819f1..29c3423e234d20 100644 --- a/Lib/test/test_tools/test_i18n.py +++ b/Lib/test/test_tools/test_i18n.py @@ -332,14 +332,14 @@ def test_calls_in_fstring_with_multiple_args(self): msgids = self.extract_docstrings_from_str(dedent('''\ f"{_('foo', 'bar')}" ''')) - self.assertNotIn('foo', msgids) + self.assertIn('foo', msgids) self.assertNotIn('bar', msgids) def test_calls_in_fstring_with_keyword_args(self): msgids = self.extract_docstrings_from_str(dedent('''\ f"{_('foo', bar='baz')}" ''')) - self.assertNotIn('foo', msgids) + self.assertIn('foo', msgids) self.assertNotIn('bar', msgids) self.assertNotIn('baz', msgids) diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index 0d16e8f7da0071..044c34de8432fb 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -163,16 +163,13 @@ import time import getopt import ast -import token import tokenize +from collections import defaultdict +from dataclasses import dataclass, field +from operator import attrgetter, itemgetter __version__ = '1.5' -default_keywords = ['_'] -DEFAULTKEYWORDS = ', '.join(default_keywords) - -EMPTYSTRING = '' - # The normal pot-file header. msgmerge and Emacs's po-mode work better if it's # there. @@ -306,12 +303,64 @@ def getFilesForName(name): return [] +# Key is the function name, value is a dictionary mapping argument positions to the +# type of the argument. The type is one of 'msgid', 'msgid_plural', or 'msgctxt'. +DEFAULTKEYWORDS = { + '_': {0: 'msgid'}, + 'gettext': {0: 'msgid'}, + 'ngettext': {0: 'msgid', 1: 'msgid_plural'}, + 'pgettext': {0: 'msgctxt', 1: 'msgid'}, + 'npgettext': {0: 'msgctxt', 1: 'msgid', 2: 'msgid_plural'}, + 'dgettext': {1: 'msgid'}, + 'dngettext': {1: 'msgid', 2: 'msgid_plural'}, + 'dpgettext': {1: 'msgctxt', 2: 'msgid'}, + 'dnpgettext': {1: 'msgctxt', 2: 'msgid', 3: 'msgid_plural'}, +} + + +def matches_spec(message, spec): + """Check if a message has all the keys defined by the keyword spec.""" + return all(key in message for key in spec.values()) + + +@dataclass(frozen=True) +class Location: + filename: str + lineno: int + + def __lt__(self, other): + return (self.filename, self.lineno) < (other.filename, other.lineno) + + +@dataclass +class Message: + msgid: str + msgid_plural: str | None + msgctxt: str | None + locations: set[Location] = field(default_factory=set) + is_docstring: bool = False + + def add_location(self, filename, lineno, msgid_plural=None, *, is_docstring=False): + if self.msgid_plural is None: + self.msgid_plural = msgid_plural + self.locations.add(Location(filename, lineno)) + self.is_docstring |= is_docstring + + +def key_for(msgid, msgctxt=None): + if msgctxt is not None: + return (msgctxt, msgid) + return msgid + + class TokenEater: def __init__(self, options): self.__options = options self.__messages = {} self.__state = self.__waiting - self.__data = [] + self.__data = defaultdict(str) + self.__curr_arg = 0 + self.__curr_keyword = None self.__lineno = -1 self.__freshmodule = 1 self.__curfile = None @@ -331,7 +380,7 @@ def __waiting(self, ttype, tstring, lineno): # module docstring? if self.__freshmodule: if ttype == tokenize.STRING and is_literal_string(tstring): - self.__addentry(safe_eval(tstring), lineno, isdocstring=1) + self.__addentry({'msgid': safe_eval(tstring)}, lineno, is_docstring=True) self.__freshmodule = 0 return if ttype in (tokenize.COMMENT, tokenize.NL, tokenize.ENCODING): @@ -346,6 +395,7 @@ def __waiting(self, ttype, tstring, lineno): return if ttype == tokenize.NAME and tstring in opts.keywords: self.__state = self.__keywordseen + self.__curr_keyword = tstring return if ttype == tokenize.STRING: maybe_fstring = ast.parse(tstring, mode='eval').body @@ -397,7 +447,8 @@ def __waiting(self, ttype, tstring, lineno): }, file=sys.stderr) continue if isinstance(arg.value, str): - self.__addentry(arg.value, lineno) + self.__curr_keyword = func_name + self.__addentry({'msgid': arg.value}, lineno) def __suiteseen(self, ttype, tstring, lineno): # skip over any enclosure pairs until we see the colon @@ -413,7 +464,7 @@ def __suiteseen(self, ttype, tstring, lineno): def __suitedocstring(self, ttype, tstring, lineno): # ignore any intervening noise if ttype == tokenize.STRING and is_literal_string(tstring): - self.__addentry(safe_eval(tstring), lineno, isdocstring=1) + self.__addentry({'msgid': safe_eval(tstring)}, lineno, is_docstring=True) self.__state = self.__waiting elif ttype not in (tokenize.NEWLINE, tokenize.INDENT, tokenize.COMMENT): @@ -422,7 +473,8 @@ def __suitedocstring(self, ttype, tstring, lineno): def __keywordseen(self, ttype, tstring, lineno): if ttype == tokenize.OP and tstring == '(': - self.__data = [] + self.__data.clear() + self.__curr_arg = 0 self.__lineno = lineno self.__state = self.__openseen else: @@ -435,31 +487,50 @@ def __openseen(self, ttype, tstring, lineno): # of messages seen. Reset state for the next batch. If there # were no strings inside _(), then just ignore this entry. if self.__data: - self.__addentry(EMPTYSTRING.join(self.__data)) + self.__addentry(self.__data) self.__state = self.__waiting elif ttype == tokenize.STRING and is_literal_string(tstring): - self.__data.append(safe_eval(tstring)) - elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT, - token.NEWLINE, tokenize.NL]: - # warn if we see anything else than STRING or whitespace - print(_( - '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' - ) % { - 'token': tstring, - 'file': self.__curfile, - 'lineno': self.__lineno - }, file=sys.stderr) - self.__state = self.__waiting + spec = self.__options.keywords[self.__curr_keyword] + arg_type = spec.get(self.__curr_arg) + if arg_type is None: + return + string = safe_eval(tstring) + self.__data[arg_type] += string + elif ttype == tokenize.OP and tstring == ',': + # Advance to the next argument + self.__curr_arg += 1 def __ignorenext(self, ttype, tstring, lineno): self.__state = self.__waiting - def __addentry(self, msg, lineno=None, isdocstring=0): + def __addentry(self, msg, lineno=None, *, is_docstring=False): + msgid = msg.get('msgid') + if msgid in self.__options.toexclude: + return + if not is_docstring: + spec = self.__options.keywords[self.__curr_keyword] + if not matches_spec(msg, spec): + return if lineno is None: lineno = self.__lineno - if not msg in self.__options.toexclude: - entry = (self.__curfile, lineno) - self.__messages.setdefault(msg, {})[entry] = isdocstring + msgctxt = msg.get('msgctxt') + msgid_plural = msg.get('msgid_plural') + key = key_for(msgid, msgctxt) + if key in self.__messages: + self.__messages[key].add_location( + self.__curfile, + lineno, + msgid_plural, + is_docstring=is_docstring, + ) + else: + self.__messages[key] = Message( + msgid=msgid, + msgid_plural=msgid_plural, + msgctxt=msgctxt, + locations={Location(self.__curfile, lineno)}, + is_docstring=is_docstring, + ) def set_filename(self, filename): self.__curfile = filename @@ -472,55 +543,54 @@ def write(self, fp): print(pot_header % {'time': timestamp, 'version': __version__, 'charset': encoding, 'encoding': '8bit'}, file=fp) - # Sort the entries. First sort each particular entry's keys, then - # sort all the entries by their first item. - reverse = {} - for k, v in self.__messages.items(): - keys = sorted(v.keys()) - reverse.setdefault(tuple(keys), []).append((k, v)) - rkeys = sorted(reverse.keys()) - for rkey in rkeys: - rentries = reverse[rkey] - rentries.sort() - for k, v in rentries: - # If the entry was gleaned out of a docstring, then add a - # comment stating so. This is to aid translators who may wish - # to skip translating some unimportant docstrings. - isdocstring = any(v.values()) - # k is the message string, v is a dictionary-set of (filename, - # lineno) tuples. We want to sort the entries in v first by - # file name and then by line number. - v = sorted(v.keys()) - if not options.writelocations: - pass + + # Sort locations within each message by filename and lineno + sorted_keys = [ + (key, sorted(msg.locations, key=attrgetter('filename', 'lineno'))) + for key, msg in self.__messages.items() + ] + # Sort messages by locations + # For example, a message with locations [('test.py', 1), ('test.py', 2)] will + # appear before a message with locations [('test.py', 1), ('test.py', 3)] + sorted_keys.sort(key=itemgetter(1)) + + for key, locations in sorted_keys: + msg = self.__messages[key] + if options.writelocations: # location comments are different b/w Solaris and GNU: - elif options.locationstyle == options.SOLARIS: - for filename, lineno in v: - d = {'filename': filename, 'lineno': lineno} - print(_( - '# File: %(filename)s, line: %(lineno)d') % d, file=fp) + if options.locationstyle == options.SOLARIS: + for location in locations: + print(f'# File: {location.filename}, line: {location.lineno}', file=fp) elif options.locationstyle == options.GNU: # fit as many locations on one line, as long as the # resulting line length doesn't exceed 'options.width' locline = '#:' - for filename, lineno in v: - d = {'filename': filename, 'lineno': lineno} - s = _(' %(filename)s:%(lineno)d') % d + for location in locations: + s = f' {location.filename}:{location.lineno}' if len(locline) + len(s) <= options.width: locline = locline + s else: print(locline, file=fp) - locline = "#:" + s + locline = f'#: {s}' if len(locline) > 2: print(locline, file=fp) - if isdocstring: - print('#, docstring', file=fp) - print('msgid', normalize(k, encoding), file=fp) + if msg.is_docstring: + # If the entry was gleaned out of a docstring, then add a + # comment stating so. This is to aid translators who may wish + # to skip translating some unimportant docstrings. + print('#, docstring', file=fp) + if msg.msgctxt is not None: + print('msgctxt', normalize(msg.msgctxt, encoding), file=fp) + print('msgid', normalize(msg.msgid, encoding), file=fp) + if msg.msgid_plural is not None: + print('msgid_plural', normalize(msg.msgid_plural, encoding), file=fp) + print('msgstr[0] ""', file=fp) + print('msgstr[1] ""\n', file=fp) + else: print('msgstr ""\n', file=fp) def main(): - global default_keywords try: opts, args = getopt.getopt( sys.argv[1:], @@ -557,7 +627,7 @@ class Options: locations = {'gnu' : options.GNU, 'solaris' : options.SOLARIS, } - + no_default_keywords = False # parse options for opt, arg in opts: if opt in ('-h', '--help'): @@ -573,7 +643,7 @@ class Options: elif opt in ('-k', '--keyword'): options.keywords.append(arg) elif opt in ('-K', '--no-default-keywords'): - default_keywords = [] + no_default_keywords = True elif opt in ('-n', '--add-location'): options.writelocations = 1 elif opt in ('--no-location',): @@ -613,7 +683,9 @@ class Options: make_escapes(not options.escape) # calculate all keywords - options.keywords.extend(default_keywords) + options.keywords = {kw: {0: 'msgid'} for kw in options.keywords} + if not no_default_keywords: + options.keywords |= DEFAULTKEYWORDS # initialize list of strings to exclude if options.excludefilename: From 06686c9476c71a0b9fa245cc4ef7f705d1c61224 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sat, 16 Nov 2024 20:42:39 +0100 Subject: [PATCH 02/14] Update snapshots --- Lib/test/translationdata/argparse/msgids.txt | 2 ++ Lib/test/translationdata/optparse/msgids.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/Lib/test/translationdata/argparse/msgids.txt b/Lib/test/translationdata/argparse/msgids.txt index 2b012906436e85..ae89ac74726ecf 100644 --- a/Lib/test/translationdata/argparse/msgids.txt +++ b/Lib/test/translationdata/argparse/msgids.txt @@ -8,6 +8,8 @@ argument %(argument_name)s: %(message)s argument '%(argument_name)s' is deprecated can't open '%(filename)s': %(error)s command '%(parser_name)s' is deprecated +conflicting option string: %s +expected %s argument expected at least one argument expected at most one argument expected one argument diff --git a/Lib/test/translationdata/optparse/msgids.txt b/Lib/test/translationdata/optparse/msgids.txt index ac5317c736af8c..8f405a2bf26dbe 100644 --- a/Lib/test/translationdata/optparse/msgids.txt +++ b/Lib/test/translationdata/optparse/msgids.txt @@ -1,3 +1,4 @@ +%(option)s option requires %(number)d argument %prog [options] %s option does not take a value Options From e5c27462084c4e5623c91e3947ec44ce3ed93b3e Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sat, 16 Nov 2024 20:43:06 +0100 Subject: [PATCH 03/14] Bump pygettext version --- Lib/test/test_tools/i18n_data/docstrings.pot | 2 +- Lib/test/test_tools/i18n_data/fileloc.pot | 2 +- Lib/test/test_tools/i18n_data/messages.pot | 2 +- Tools/i18n/pygettext.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_tools/i18n_data/docstrings.pot b/Lib/test/test_tools/i18n_data/docstrings.pot index 5af1d41422ff62..e7bd5e3c457470 100644 --- a/Lib/test/test_tools/i18n_data/docstrings.pot +++ b/Lib/test/test_tools/i18n_data/docstrings.pot @@ -12,7 +12,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: pygettext.py 1.5\n" +"Generated-By: pygettext.py 1.6\n" #: docstrings.py:7 diff --git a/Lib/test/test_tools/i18n_data/fileloc.pot b/Lib/test/test_tools/i18n_data/fileloc.pot index dbd28687a73556..2df93fa273b4e4 100644 --- a/Lib/test/test_tools/i18n_data/fileloc.pot +++ b/Lib/test/test_tools/i18n_data/fileloc.pot @@ -12,7 +12,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: pygettext.py 1.5\n" +"Generated-By: pygettext.py 1.6\n" #: fileloc.py:5 fileloc.py:6 diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot index 00b49ebb6d17d9..6a00f9acdb0b07 100644 --- a/Lib/test/test_tools/i18n_data/messages.pot +++ b/Lib/test/test_tools/i18n_data/messages.pot @@ -12,7 +12,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: pygettext.py 1.5\n" +"Generated-By: pygettext.py 1.6\n" #: messages.py:16 diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index 044c34de8432fb..857b79ec44a0a1 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -168,7 +168,7 @@ from dataclasses import dataclass, field from operator import attrgetter, itemgetter -__version__ = '1.5' +__version__ = '1.6' # The normal pot-file header. msgmerge and Emacs's po-mode work better if it's From 511a0c0fcd56ee7e21d0d5c70d4d420719dc9d70 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sat, 16 Nov 2024 20:47:26 +0100 Subject: [PATCH 04/14] Add news entry --- .../Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst diff --git a/Misc/NEWS.d/next/Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst b/Misc/NEWS.d/next/Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst new file mode 100644 index 00000000000000..2cb961466ed738 --- /dev/null +++ b/Misc/NEWS.d/next/Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst @@ -0,0 +1 @@ +Add support for multi-argument gettext functions in :program:`pygettext.py`. From 62d6455f2c13a64347323bc4b0d2e0db07b417b3 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sun, 17 Nov 2024 11:10:17 +0100 Subject: [PATCH 05/14] Correctly count enclosures --- Lib/test/test_tools/i18n_data/messages.pot | 5 +++++ Lib/test/test_tools/i18n_data/messages.py | 4 ++++ Tools/i18n/pygettext.py | 14 ++++++++++---- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot index 6a00f9acdb0b07..b6235699c62c3d 100644 --- a/Lib/test/test_tools/i18n_data/messages.pot +++ b/Lib/test/test_tools/i18n_data/messages.pot @@ -38,6 +38,7 @@ msgid "kwargs work!" msgstr "" #: messages.py:38 messages.py:81 messages.py:82 messages.py:85 messages.py:86 +#: messages.py:91 msgid "foo" msgid_plural "foos" msgstr[0] "" @@ -90,3 +91,7 @@ msgid_plural "foos" msgstr[0] "" msgstr[1] "" +#: messages.py:92 +msgid "domain foo" +msgstr "" + diff --git a/Lib/test/test_tools/i18n_data/messages.py b/Lib/test/test_tools/i18n_data/messages.py index 02643fed57ed2e..6c01ef348d64ac 100644 --- a/Lib/test/test_tools/i18n_data/messages.py +++ b/Lib/test/test_tools/i18n_data/messages.py @@ -87,6 +87,10 @@ def _(x="don't extract me"): dpgettext("domain", "context", "foo") dnpgettext("domain", "context", "foo", "foos", 1) +# Complex arguments +ngettext("foo", "foos", 42 + (10 - 20)) +dgettext(["some", {"complex"}, ("argument",)], "domain foo") + # Invalid calls which are not extracted gettext() ngettext('foo') diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index 857b79ec44a0a1..bdf8e8c9120d1c 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -475,13 +475,14 @@ def __keywordseen(self, ttype, tstring, lineno): if ttype == tokenize.OP and tstring == '(': self.__data.clear() self.__curr_arg = 0 + self.__enclosurecount = 0 self.__lineno = lineno self.__state = self.__openseen else: self.__state = self.__waiting def __openseen(self, ttype, tstring, lineno): - if ttype == tokenize.OP and tstring == ')': + if ttype == tokenize.OP and tstring == ')' and self.__enclosurecount == 0: # We've seen the last of the translatable strings. Record the # line number of the first line of the strings and update the list # of messages seen. Reset state for the next batch. If there @@ -496,9 +497,14 @@ def __openseen(self, ttype, tstring, lineno): return string = safe_eval(tstring) self.__data[arg_type] += string - elif ttype == tokenize.OP and tstring == ',': - # Advance to the next argument - self.__curr_arg += 1 + elif ttype == tokenize.OP: + if tstring == ',' and self.__enclosurecount == 0: + # Advance to the next argument + self.__curr_arg += 1 + elif tstring in '([{': + self.__enclosurecount += 1 + elif tstring in ')]}': + self.__enclosurecount -= 1 def __ignorenext(self, ttype, tstring, lineno): self.__state = self.__waiting From 496f5d93a9c891867010f108b58bb241449c3db1 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sun, 17 Nov 2024 11:31:08 +0100 Subject: [PATCH 06/14] Restore warnings for invalid arguments --- Lib/test/test_tools/i18n_data/messages.pot | 34 +++++-------- Lib/test/test_tools/i18n_data/messages.py | 4 +- Tools/i18n/pygettext.py | 55 +++++++++++++++------- 3 files changed, 54 insertions(+), 39 deletions(-) diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot index b6235699c62c3d..6e32a275ec3821 100644 --- a/Lib/test/test_tools/i18n_data/messages.pot +++ b/Lib/test/test_tools/i18n_data/messages.pot @@ -33,65 +33,57 @@ msgid "" " multiline!\n" msgstr "" -#: messages.py:37 -msgid "kwargs work!" -msgstr "" - -#: messages.py:38 messages.py:81 messages.py:82 messages.py:85 messages.py:86 -#: messages.py:91 +#: messages.py:40 messages.py:83 messages.py:84 messages.py:87 messages.py:88 +#: messages.py:93 msgid "foo" msgid_plural "foos" msgstr[0] "" msgstr[1] "" -#: messages.py:39 +#: messages.py:41 msgid "something" msgstr "" -#: messages.py:42 +#: messages.py:44 msgid "Hello, {}!" msgstr "" -#: messages.py:43 -msgid "Hello, {}!world" -msgstr "" - -#: messages.py:46 +#: messages.py:48 msgid "1" msgstr "" -#: messages.py:46 +#: messages.py:48 msgid "2" msgstr "" -#: messages.py:47 messages.py:48 +#: messages.py:49 messages.py:50 msgid "A" msgstr "" -#: messages.py:47 messages.py:48 +#: messages.py:49 messages.py:50 msgid "B" msgstr "" -#: messages.py:49 +#: messages.py:51 msgid "set" msgstr "" -#: messages.py:55 +#: messages.py:57 msgid "nested string" msgstr "" -#: messages.py:60 +#: messages.py:62 msgid "baz" msgstr "" -#: messages.py:83 messages.py:84 messages.py:87 messages.py:88 +#: messages.py:85 messages.py:86 messages.py:89 messages.py:90 msgctxt "context" msgid "foo" msgid_plural "foos" msgstr[0] "" msgstr[1] "" -#: messages.py:92 +#: messages.py:94 msgid "domain foo" msgstr "" diff --git a/Lib/test/test_tools/i18n_data/messages.py b/Lib/test/test_tools/i18n_data/messages.py index 6c01ef348d64ac..df28ab471a4599 100644 --- a/Lib/test/test_tools/i18n_data/messages.py +++ b/Lib/test/test_tools/i18n_data/messages.py @@ -33,8 +33,10 @@ _(1) _(False) -# Unusual, but valid arguments +# pygettext does not allow keyword arguments, but both xgettext and pybabel do _(x="kwargs work!") + +# Unusual, but valid arguments _("foo", "bar") _("something", x="something else") diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index bdf8e8c9120d1c..5ed1366cc70d12 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -482,29 +482,41 @@ def __keywordseen(self, ttype, tstring, lineno): self.__state = self.__waiting def __openseen(self, ttype, tstring, lineno): - if ttype == tokenize.OP and tstring == ')' and self.__enclosurecount == 0: - # We've seen the last of the translatable strings. Record the - # line number of the first line of the strings and update the list - # of messages seen. Reset state for the next batch. If there - # were no strings inside _(), then just ignore this entry. - if self.__data: - self.__addentry(self.__data) - self.__state = self.__waiting - elif ttype == tokenize.STRING and is_literal_string(tstring): - spec = self.__options.keywords[self.__curr_keyword] - arg_type = spec.get(self.__curr_arg) - if arg_type is None: - return - string = safe_eval(tstring) - self.__data[arg_type] += string - elif ttype == tokenize.OP: - if tstring == ',' and self.__enclosurecount == 0: + spec = self.__options.keywords[self.__curr_keyword] + arg_type = spec.get(self.__curr_arg) + expect_string_literal = arg_type is not None + + if ttype == tokenize.OP: + if tstring == ')' and self.__enclosurecount == 0: + # We've seen the last of the translatable strings. Record the + # line number of the first line of the strings and update the list + # of messages seen. Reset state for the next batch. If there + # were no strings inside _(), then just ignore this entry. + if self.__data: + self.__addentry(self.__data) + self.__state = self.__waiting + elif tstring == ',' and self.__enclosurecount == 0: # Advance to the next argument self.__curr_arg += 1 elif tstring in '([{': self.__enclosurecount += 1 elif tstring in ')]}': self.__enclosurecount -= 1 + elif expect_string_literal: + # We are inside an argument which is a translatable string and + # we encountered a token that is not a string. This is an error. + self.warn_unexpected_token(tstring) + self.__enclosurecount = 0 + self.__state = self.__waiting + elif expect_string_literal: + if ttype == tokenize.STRING and is_literal_string(tstring): + self.__data[arg_type] += safe_eval(tstring) + elif ttype not in (tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT, + tokenize.NEWLINE, tokenize.NL): + self.warn_unexpected_token(tstring) + self.__enclosurecount = 0 + self.__state = self.__waiting + def __ignorenext(self, ttype, tstring, lineno): self.__state = self.__waiting @@ -538,6 +550,15 @@ def __addentry(self, msg, lineno=None, *, is_docstring=False): is_docstring=is_docstring, ) + def warn_unexpected_token(self, token): + print(_( + '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' + ) % { + 'token': token, + 'file': self.__curfile, + 'lineno': self.__lineno + }, file=sys.stderr) + def set_filename(self, filename): self.__curfile = filename self.__freshmodule = 1 From 06186a00110a15d418c0bdec7475c4934a751d4f Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sun, 17 Nov 2024 11:59:36 +0100 Subject: [PATCH 07/14] Remove extra space --- Lib/test/test_tools/i18n_data/messages.pot | 2 +- Tools/i18n/pygettext.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot index 6e32a275ec3821..61e1cf2446d5b8 100644 --- a/Lib/test/test_tools/i18n_data/messages.pot +++ b/Lib/test/test_tools/i18n_data/messages.pot @@ -34,7 +34,7 @@ msgid "" msgstr "" #: messages.py:40 messages.py:83 messages.py:84 messages.py:87 messages.py:88 -#: messages.py:93 +#: messages.py:93 msgid "foo" msgid_plural "foos" msgstr[0] "" diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index 5ed1366cc70d12..e4d7edfe33f33a 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -598,7 +598,7 @@ def write(self, fp): locline = locline + s else: print(locline, file=fp) - locline = f'#: {s}' + locline = f'#:{s}' if len(locline) > 2: print(locline, file=fp) if msg.is_docstring: From 3d67a7ac5ec5771bb34a3901d7a0ae73fe452b2f Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Sun, 17 Nov 2024 13:39:51 +0100 Subject: [PATCH 08/14] Simplify code --- Tools/i18n/pygettext.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index e4d7edfe33f33a..33d0169fd176e9 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -166,7 +166,7 @@ import tokenize from collections import defaultdict from dataclasses import dataclass, field -from operator import attrgetter, itemgetter +from operator import itemgetter __version__ = '1.6' @@ -573,7 +573,7 @@ def write(self, fp): # Sort locations within each message by filename and lineno sorted_keys = [ - (key, sorted(msg.locations, key=attrgetter('filename', 'lineno'))) + (key, sorted(msg.locations)) for key, msg in self.__messages.items() ] # Sort messages by locations From 48070d52486737e70d2e0557f0c9f3cd6186b6f7 Mon Sep 17 00:00:00 2001 From: "Tomas R." Date: Sun, 17 Nov 2024 16:44:16 +0100 Subject: [PATCH 09/14] Update comment --- Lib/test/test_tools/i18n_data/messages.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_tools/i18n_data/messages.py b/Lib/test/test_tools/i18n_data/messages.py index df28ab471a4599..28732a2f352426 100644 --- a/Lib/test/test_tools/i18n_data/messages.py +++ b/Lib/test/test_tools/i18n_data/messages.py @@ -42,7 +42,7 @@ # .format() _("Hello, {}!").format("world") # valid -_("Hello, {}!".format("world")) # should be invalid, but is extracted (also by xgettext and pybabel) +_("Hello, {}!".format("world")) # invalid, but xgettext and pybabel extract the first string # Nested structures _("1"), _("2") From d6fd789c15a575b46b104a0492b67304f3d66655 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Mon, 18 Nov 2024 18:27:35 +0100 Subject: [PATCH 10/14] Only extract when __enclosure_count is 0 --- Lib/test/test_tools/i18n_data/messages.pot | 26 +++++++++++----------- Lib/test/test_tools/i18n_data/messages.py | 3 +++ Tools/i18n/pygettext.py | 2 +- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot index 61e1cf2446d5b8..a5243b287cb4de 100644 --- a/Lib/test/test_tools/i18n_data/messages.pot +++ b/Lib/test/test_tools/i18n_data/messages.pot @@ -33,57 +33,57 @@ msgid "" " multiline!\n" msgstr "" -#: messages.py:40 messages.py:83 messages.py:84 messages.py:87 messages.py:88 -#: messages.py:93 +#: messages.py:43 messages.py:86 messages.py:87 messages.py:90 messages.py:91 +#: messages.py:96 msgid "foo" msgid_plural "foos" msgstr[0] "" msgstr[1] "" -#: messages.py:41 +#: messages.py:44 msgid "something" msgstr "" -#: messages.py:44 +#: messages.py:47 msgid "Hello, {}!" msgstr "" -#: messages.py:48 +#: messages.py:51 msgid "1" msgstr "" -#: messages.py:48 +#: messages.py:51 msgid "2" msgstr "" -#: messages.py:49 messages.py:50 +#: messages.py:52 messages.py:53 msgid "A" msgstr "" -#: messages.py:49 messages.py:50 +#: messages.py:52 messages.py:53 msgid "B" msgstr "" -#: messages.py:51 +#: messages.py:54 msgid "set" msgstr "" -#: messages.py:57 +#: messages.py:60 msgid "nested string" msgstr "" -#: messages.py:62 +#: messages.py:65 msgid "baz" msgstr "" -#: messages.py:85 messages.py:86 messages.py:89 messages.py:90 +#: messages.py:88 messages.py:89 messages.py:92 messages.py:93 msgctxt "context" msgid "foo" msgid_plural "foos" msgstr[0] "" msgstr[1] "" -#: messages.py:94 +#: messages.py:97 msgid "domain foo" msgstr "" diff --git a/Lib/test/test_tools/i18n_data/messages.py b/Lib/test/test_tools/i18n_data/messages.py index 28732a2f352426..e7d974878a16d0 100644 --- a/Lib/test/test_tools/i18n_data/messages.py +++ b/Lib/test/test_tools/i18n_data/messages.py @@ -32,6 +32,9 @@ _(None) _(1) _(False) +_(("invalid")) +_(["invalid"]) +_({"invalid"}) # pygettext does not allow keyword arguments, but both xgettext and pybabel do _(x="kwargs work!") diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index 33d0169fd176e9..b8f1680efad85b 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -508,7 +508,7 @@ def __openseen(self, ttype, tstring, lineno): self.warn_unexpected_token(tstring) self.__enclosurecount = 0 self.__state = self.__waiting - elif expect_string_literal: + elif expect_string_literal and self.__enclosurecount == 0: if ttype == tokenize.STRING and is_literal_string(tstring): self.__data[arg_type] += safe_eval(tstring) elif ttype not in (tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT, From 34976907f78b7f6e4ea6c988778fa79681c3a92d Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Mon, 18 Nov 2024 18:30:49 +0100 Subject: [PATCH 11/14] Keep the old version --- Tools/i18n/pygettext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index b8f1680efad85b..2b77d2251ed4d8 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -168,7 +168,7 @@ from dataclasses import dataclass, field from operator import itemgetter -__version__ = '1.6' +__version__ = '1.5' # The normal pot-file header. msgmerge and Emacs's po-mode work better if it's From 192187e14626818edd3f02fd0da982831671bd86 Mon Sep 17 00:00:00 2001 From: "Tomas R." Date: Mon, 18 Nov 2024 18:31:50 +0100 Subject: [PATCH 12/14] Improve news entry Co-authored-by: Serhiy Storchaka --- .../Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst b/Misc/NEWS.d/next/Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst index 2cb961466ed738..c08ad9d7059904 100644 --- a/Misc/NEWS.d/next/Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst +++ b/Misc/NEWS.d/next/Tools-Demos/2024-11-16-20-47-20.gh-issue-126700.ayrHv4.rst @@ -1 +1 @@ -Add support for multi-argument gettext functions in :program:`pygettext.py`. +Add support for multi-argument :mod:`gettext` functions in :program:`pygettext.py`. From 9cfc901632680090b5f6f24a577aff095371572b Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Mon, 18 Nov 2024 18:41:39 +0100 Subject: [PATCH 13/14] Update snapshots --- Lib/test/test_tools/i18n_data/docstrings.pot | 2 +- Lib/test/test_tools/i18n_data/fileloc.pot | 2 +- Lib/test/test_tools/i18n_data/messages.pot | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_tools/i18n_data/docstrings.pot b/Lib/test/test_tools/i18n_data/docstrings.pot index e7bd5e3c457470..5af1d41422ff62 100644 --- a/Lib/test/test_tools/i18n_data/docstrings.pot +++ b/Lib/test/test_tools/i18n_data/docstrings.pot @@ -12,7 +12,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: pygettext.py 1.6\n" +"Generated-By: pygettext.py 1.5\n" #: docstrings.py:7 diff --git a/Lib/test/test_tools/i18n_data/fileloc.pot b/Lib/test/test_tools/i18n_data/fileloc.pot index 2df93fa273b4e4..dbd28687a73556 100644 --- a/Lib/test/test_tools/i18n_data/fileloc.pot +++ b/Lib/test/test_tools/i18n_data/fileloc.pot @@ -12,7 +12,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: pygettext.py 1.6\n" +"Generated-By: pygettext.py 1.5\n" #: fileloc.py:5 fileloc.py:6 diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot index a5243b287cb4de..7eff47ac6525fd 100644 --- a/Lib/test/test_tools/i18n_data/messages.pot +++ b/Lib/test/test_tools/i18n_data/messages.pot @@ -12,7 +12,7 @@ msgstr "" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" -"Generated-By: pygettext.py 1.6\n" +"Generated-By: pygettext.py 1.5\n" #: messages.py:16 From 24851c5544cafcf7aec5f4326d67548aaaeb09c0 Mon Sep 17 00:00:00 2001 From: Tomas Roun Date: Thu, 21 Nov 2024 22:47:51 +0100 Subject: [PATCH 14/14] Refactor __openseen --- Lib/test/test_tools/i18n_data/messages.pot | 26 +++++++++---------- Lib/test/test_tools/i18n_data/messages.py | 3 +++ Tools/i18n/pygettext.py | 29 +++++++++++----------- 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/Lib/test/test_tools/i18n_data/messages.pot b/Lib/test/test_tools/i18n_data/messages.pot index 7eff47ac6525fd..8d66fbc4f3a937 100644 --- a/Lib/test/test_tools/i18n_data/messages.pot +++ b/Lib/test/test_tools/i18n_data/messages.pot @@ -33,57 +33,57 @@ msgid "" " multiline!\n" msgstr "" -#: messages.py:43 messages.py:86 messages.py:87 messages.py:90 messages.py:91 -#: messages.py:96 +#: messages.py:46 messages.py:89 messages.py:90 messages.py:93 messages.py:94 +#: messages.py:99 msgid "foo" msgid_plural "foos" msgstr[0] "" msgstr[1] "" -#: messages.py:44 +#: messages.py:47 msgid "something" msgstr "" -#: messages.py:47 +#: messages.py:50 msgid "Hello, {}!" msgstr "" -#: messages.py:51 +#: messages.py:54 msgid "1" msgstr "" -#: messages.py:51 +#: messages.py:54 msgid "2" msgstr "" -#: messages.py:52 messages.py:53 +#: messages.py:55 messages.py:56 msgid "A" msgstr "" -#: messages.py:52 messages.py:53 +#: messages.py:55 messages.py:56 msgid "B" msgstr "" -#: messages.py:54 +#: messages.py:57 msgid "set" msgstr "" -#: messages.py:60 +#: messages.py:63 msgid "nested string" msgstr "" -#: messages.py:65 +#: messages.py:68 msgid "baz" msgstr "" -#: messages.py:88 messages.py:89 messages.py:92 messages.py:93 +#: messages.py:91 messages.py:92 messages.py:95 messages.py:96 msgctxt "context" msgid "foo" msgid_plural "foos" msgstr[0] "" msgstr[1] "" -#: messages.py:97 +#: messages.py:100 msgid "domain foo" msgstr "" diff --git a/Lib/test/test_tools/i18n_data/messages.py b/Lib/test/test_tools/i18n_data/messages.py index e7d974878a16d0..1e03f4e556830d 100644 --- a/Lib/test/test_tools/i18n_data/messages.py +++ b/Lib/test/test_tools/i18n_data/messages.py @@ -35,6 +35,9 @@ _(("invalid")) _(["invalid"]) _({"invalid"}) +_("string"[3]) +_("string"[:3]) +_({"string": "foo"}) # pygettext does not allow keyword arguments, but both xgettext and pybabel do _(x="kwargs work!") diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index 2b77d2251ed4d8..f78ff16bff9039 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -486,8 +486,8 @@ def __openseen(self, ttype, tstring, lineno): arg_type = spec.get(self.__curr_arg) expect_string_literal = arg_type is not None - if ttype == tokenize.OP: - if tstring == ')' and self.__enclosurecount == 0: + if ttype == tokenize.OP and self.__enclosurecount == 0: + if tstring == ')': # We've seen the last of the translatable strings. Record the # line number of the first line of the strings and update the list # of messages seen. Reset state for the next batch. If there @@ -495,28 +495,27 @@ def __openseen(self, ttype, tstring, lineno): if self.__data: self.__addentry(self.__data) self.__state = self.__waiting - elif tstring == ',' and self.__enclosurecount == 0: + return + elif tstring == ',': # Advance to the next argument self.__curr_arg += 1 - elif tstring in '([{': - self.__enclosurecount += 1 - elif tstring in ')]}': - self.__enclosurecount -= 1 - elif expect_string_literal: - # We are inside an argument which is a translatable string and - # we encountered a token that is not a string. This is an error. - self.warn_unexpected_token(tstring) - self.__enclosurecount = 0 - self.__state = self.__waiting - elif expect_string_literal and self.__enclosurecount == 0: + return + + if expect_string_literal: if ttype == tokenize.STRING and is_literal_string(tstring): self.__data[arg_type] += safe_eval(tstring) elif ttype not in (tokenize.COMMENT, tokenize.INDENT, tokenize.DEDENT, tokenize.NEWLINE, tokenize.NL): + # We are inside an argument which is a translatable string and + # we encountered a token that is not a string. This is an error. self.warn_unexpected_token(tstring) self.__enclosurecount = 0 self.__state = self.__waiting - + elif ttype == tokenize.OP: + if tstring in '([{': + self.__enclosurecount += 1 + elif tstring in ')]}': + self.__enclosurecount -= 1 def __ignorenext(self, ttype, tstring, lineno): self.__state = self.__waiting