Skip to content

Commit 08ccbb9

Browse files
[3.13] gh-52551: Fix encoding issues in strftime() (GH-125193) (GH-125657)
Fix time.strftime(), the strftime() method and formatting of the datetime classes datetime, date and time. * Characters not encodable in the current locale are now acceptable in the format string. * Surrogate pairs and sequences of surrogateescape-encoded bytes are no longer recombined. * An embedded null character no longer terminates the format string. This also fixes gh-78662 and gh-124531. (cherry picked from commit ad3eac1)
1 parent d894d46 commit 08ccbb9

File tree

5 files changed

+293
-211
lines changed

5 files changed

+293
-211
lines changed

Lib/test/datetimetester.py

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2855,11 +2855,32 @@ def test_more_strftime(self):
28552855
self.assertEqual(t.strftime("%z"), "-0200" + z)
28562856
self.assertEqual(t.strftime("%:z"), "-02:00:" + z)
28572857

2858-
# bpo-34482: Check that surrogates don't cause a crash.
2859-
try:
2860-
t.strftime('%y\ud800%m %H\ud800%M')
2861-
except UnicodeEncodeError:
2862-
pass
2858+
def test_strftime_special(self):
2859+
t = self.theclass(2004, 12, 31, 6, 22, 33, 47)
2860+
s1 = t.strftime('%c')
2861+
s2 = t.strftime('%B')
2862+
# gh-52551, gh-78662: Unicode strings should pass through strftime,
2863+
# independently from locale.
2864+
self.assertEqual(t.strftime('\U0001f40d'), '\U0001f40d')
2865+
self.assertEqual(t.strftime('\U0001f4bb%c\U0001f40d%B'), f'\U0001f4bb{s1}\U0001f40d{s2}')
2866+
self.assertEqual(t.strftime('%c\U0001f4bb%B\U0001f40d'), f'{s1}\U0001f4bb{s2}\U0001f40d')
2867+
# Lone surrogates should pass through.
2868+
self.assertEqual(t.strftime('\ud83d'), '\ud83d')
2869+
self.assertEqual(t.strftime('\udc0d'), '\udc0d')
2870+
self.assertEqual(t.strftime('\ud83d%c\udc0d%B'), f'\ud83d{s1}\udc0d{s2}')
2871+
self.assertEqual(t.strftime('%c\ud83d%B\udc0d'), f'{s1}\ud83d{s2}\udc0d')
2872+
self.assertEqual(t.strftime('%c\udc0d%B\ud83d'), f'{s1}\udc0d{s2}\ud83d')
2873+
# Surrogate pairs should not recombine.
2874+
self.assertEqual(t.strftime('\ud83d\udc0d'), '\ud83d\udc0d')
2875+
self.assertEqual(t.strftime('%c\ud83d\udc0d%B'), f'{s1}\ud83d\udc0d{s2}')
2876+
# Surrogate-escaped bytes should not recombine.
2877+
self.assertEqual(t.strftime('\udcf0\udc9f\udc90\udc8d'), '\udcf0\udc9f\udc90\udc8d')
2878+
self.assertEqual(t.strftime('%c\udcf0\udc9f\udc90\udc8d%B'), f'{s1}\udcf0\udc9f\udc90\udc8d{s2}')
2879+
# gh-124531: The null character should not terminate the format string.
2880+
self.assertEqual(t.strftime('\0'), '\0')
2881+
self.assertEqual(t.strftime('\0'*1000), '\0'*1000)
2882+
self.assertEqual(t.strftime('\0%c\0%B'), f'\0{s1}\0{s2}')
2883+
self.assertEqual(t.strftime('%c\0%B\0'), f'{s1}\0{s2}\0')
28632884

28642885
def test_extract(self):
28652886
dt = self.theclass(2002, 3, 4, 18, 45, 3, 1234)
@@ -3633,6 +3654,33 @@ def test_strftime(self):
36333654
# gh-85432: The parameter was named "fmt" in the pure-Python impl.
36343655
t.strftime(format="%f")
36353656

3657+
def test_strftime_special(self):
3658+
t = self.theclass(1, 2, 3, 4)
3659+
s1 = t.strftime('%I%p%Z')
3660+
s2 = t.strftime('%X')
3661+
# gh-52551, gh-78662: Unicode strings should pass through strftime,
3662+
# independently from locale.
3663+
self.assertEqual(t.strftime('\U0001f40d'), '\U0001f40d')
3664+
self.assertEqual(t.strftime('\U0001f4bb%I%p%Z\U0001f40d%X'), f'\U0001f4bb{s1}\U0001f40d{s2}')
3665+
self.assertEqual(t.strftime('%I%p%Z\U0001f4bb%X\U0001f40d'), f'{s1}\U0001f4bb{s2}\U0001f40d')
3666+
# Lone surrogates should pass through.
3667+
self.assertEqual(t.strftime('\ud83d'), '\ud83d')
3668+
self.assertEqual(t.strftime('\udc0d'), '\udc0d')
3669+
self.assertEqual(t.strftime('\ud83d%I%p%Z\udc0d%X'), f'\ud83d{s1}\udc0d{s2}')
3670+
self.assertEqual(t.strftime('%I%p%Z\ud83d%X\udc0d'), f'{s1}\ud83d{s2}\udc0d')
3671+
self.assertEqual(t.strftime('%I%p%Z\udc0d%X\ud83d'), f'{s1}\udc0d{s2}\ud83d')
3672+
# Surrogate pairs should not recombine.
3673+
self.assertEqual(t.strftime('\ud83d\udc0d'), '\ud83d\udc0d')
3674+
self.assertEqual(t.strftime('%I%p%Z\ud83d\udc0d%X'), f'{s1}\ud83d\udc0d{s2}')
3675+
# Surrogate-escaped bytes should not recombine.
3676+
self.assertEqual(t.strftime('\udcf0\udc9f\udc90\udc8d'), '\udcf0\udc9f\udc90\udc8d')
3677+
self.assertEqual(t.strftime('%I%p%Z\udcf0\udc9f\udc90\udc8d%X'), f'{s1}\udcf0\udc9f\udc90\udc8d{s2}')
3678+
# gh-124531: The null character should not terminate the format string.
3679+
self.assertEqual(t.strftime('\0'), '\0')
3680+
self.assertEqual(t.strftime('\0'*1000), '\0'*1000)
3681+
self.assertEqual(t.strftime('\0%I%p%Z\0%X'), f'\0{s1}\0{s2}')
3682+
self.assertEqual(t.strftime('%I%p%Z\0%X\0'), f'{s1}\0{s2}\0')
3683+
36363684
def test_format(self):
36373685
t = self.theclass(1, 2, 3, 4)
36383686
self.assertEqual(t.__format__(''), str(t))
@@ -4084,9 +4132,8 @@ def tzname(self, dt): return self.tz
40844132
self.assertRaises(TypeError, t.strftime, "%Z")
40854133

40864134
# Issue #6697:
4087-
if '_Fast' in self.__class__.__name__:
4088-
Badtzname.tz = '\ud800'
4089-
self.assertRaises(ValueError, t.strftime, "%Z")
4135+
Badtzname.tz = '\ud800'
4136+
self.assertEqual(t.strftime("%Z"), '\ud800')
40904137

40914138
def test_hash_edge_cases(self):
40924139
# Offsets that overflow a basic time.

Lib/test/test_time.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,33 @@ def test_strftime(self):
181181
self.fail('conversion specifier: %r failed.' % format)
182182

183183
self.assertRaises(TypeError, time.strftime, b'%S', tt)
184-
# embedded null character
185-
self.assertRaises(ValueError, time.strftime, '%S\0', tt)
184+
185+
def test_strftime_special(self):
186+
tt = time.gmtime(self.t)
187+
s1 = time.strftime('%c', tt)
188+
s2 = time.strftime('%B', tt)
189+
# gh-52551, gh-78662: Unicode strings should pass through strftime,
190+
# independently from locale.
191+
self.assertEqual(time.strftime('\U0001f40d', tt), '\U0001f40d')
192+
self.assertEqual(time.strftime('\U0001f4bb%c\U0001f40d%B', tt), f'\U0001f4bb{s1}\U0001f40d{s2}')
193+
self.assertEqual(time.strftime('%c\U0001f4bb%B\U0001f40d', tt), f'{s1}\U0001f4bb{s2}\U0001f40d')
194+
# Lone surrogates should pass through.
195+
self.assertEqual(time.strftime('\ud83d', tt), '\ud83d')
196+
self.assertEqual(time.strftime('\udc0d', tt), '\udc0d')
197+
self.assertEqual(time.strftime('\ud83d%c\udc0d%B', tt), f'\ud83d{s1}\udc0d{s2}')
198+
self.assertEqual(time.strftime('%c\ud83d%B\udc0d', tt), f'{s1}\ud83d{s2}\udc0d')
199+
self.assertEqual(time.strftime('%c\udc0d%B\ud83d', tt), f'{s1}\udc0d{s2}\ud83d')
200+
# Surrogate pairs should not recombine.
201+
self.assertEqual(time.strftime('\ud83d\udc0d', tt), '\ud83d\udc0d')
202+
self.assertEqual(time.strftime('%c\ud83d\udc0d%B', tt), f'{s1}\ud83d\udc0d{s2}')
203+
# Surrogate-escaped bytes should not recombine.
204+
self.assertEqual(time.strftime('\udcf0\udc9f\udc90\udc8d', tt), '\udcf0\udc9f\udc90\udc8d')
205+
self.assertEqual(time.strftime('%c\udcf0\udc9f\udc90\udc8d%B', tt), f'{s1}\udcf0\udc9f\udc90\udc8d{s2}')
206+
# gh-124531: The null character should not terminate the format string.
207+
self.assertEqual(time.strftime('\0', tt), '\0')
208+
self.assertEqual(time.strftime('\0'*1000, tt), '\0'*1000)
209+
self.assertEqual(time.strftime('\0%c\0%B', tt), f'\0{s1}\0{s2}')
210+
self.assertEqual(time.strftime('%c\0%B\0', tt), f'{s1}\0{s2}\0')
186211

187212
def _bounds_checking(self, func):
188213
# Make sure that strftime() checks the bounds of the various parts
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
Fix encoding issues in :func:`time.strftime`, the
2+
:meth:`~datetime.datetime.strftime` method of the :mod:`datetime` classes
3+
:class:`~datetime.datetime`, :class:`~datetime.date` and
4+
:class:`~datetime.time` and formatting of these classes. Characters not
5+
encodable in the current locale are now acceptable in the format string.
6+
Surrogate pairs and sequences of surrogateescape-encoded bytes are no longer
7+
recombined. An embedded null character no longer terminates the format
8+
string.

Modules/_datetimemodule.c

Lines changed: 67 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -1746,7 +1746,7 @@ make_somezreplacement(PyObject *object, char *sep, PyObject *tzinfoarg)
17461746
PyObject *tzinfo = get_tzinfo_member(object);
17471747

17481748
if (tzinfo == Py_None || tzinfo == NULL) {
1749-
return PyBytes_FromStringAndSize(NULL, 0);
1749+
return PyUnicode_FromStringAndSize(NULL, 0);
17501750
}
17511751

17521752
assert(tzinfoarg != NULL);
@@ -1757,7 +1757,7 @@ make_somezreplacement(PyObject *object, char *sep, PyObject *tzinfoarg)
17571757
tzinfoarg) < 0)
17581758
return NULL;
17591759

1760-
return PyBytes_FromStringAndSize(buf, strlen(buf));
1760+
return PyUnicode_FromString(buf);
17611761
}
17621762

17631763
static PyObject *
@@ -1814,7 +1814,7 @@ make_freplacement(PyObject *object)
18141814
else
18151815
sprintf(freplacement, "%06d", 0);
18161816

1817-
return PyBytes_FromStringAndSize(freplacement, strlen(freplacement));
1817+
return PyUnicode_FromString(freplacement);
18181818
}
18191819

18201820
/* I sure don't want to reproduce the strftime code from the time module,
@@ -1835,159 +1835,124 @@ wrap_strftime(PyObject *object, PyObject *format, PyObject *timetuple,
18351835
PyObject *Zreplacement = NULL; /* py string, replacement for %Z */
18361836
PyObject *freplacement = NULL; /* py string, replacement for %f */
18371837

1838-
const char *pin; /* pointer to next char in input format */
1839-
Py_ssize_t flen; /* length of input format */
1840-
char ch; /* next char in input format */
1841-
1842-
PyObject *newfmt = NULL; /* py string, the output format */
1843-
char *pnew; /* pointer to available byte in output format */
1844-
size_t totalnew; /* number bytes total in output format buffer,
1845-
exclusive of trailing \0 */
1846-
size_t usednew; /* number bytes used so far in output format buffer */
1847-
1848-
const char *ptoappend; /* ptr to string to append to output buffer */
1849-
Py_ssize_t ntoappend; /* # of bytes to append to output buffer */
1850-
18511838
assert(object && format && timetuple);
18521839
assert(PyUnicode_Check(format));
1853-
/* Convert the input format to a C string and size */
1854-
pin = PyUnicode_AsUTF8AndSize(format, &flen);
1855-
if (!pin)
1840+
1841+
PyObject *strftime = _PyImport_GetModuleAttrString("time", "strftime");
1842+
if (strftime == NULL) {
18561843
return NULL;
1844+
}
18571845

18581846
/* Scan the input format, looking for %z/%Z/%f escapes, building
18591847
* a new format. Since computing the replacements for those codes
18601848
* is expensive, don't unless they're actually used.
18611849
*/
1862-
if (flen > INT_MAX - 1) {
1863-
PyErr_NoMemory();
1864-
goto Done;
1865-
}
18661850

1867-
totalnew = flen + 1; /* realistic if no %z/%Z */
1868-
newfmt = PyBytes_FromStringAndSize(NULL, totalnew);
1869-
if (newfmt == NULL) goto Done;
1870-
pnew = PyBytes_AsString(newfmt);
1871-
usednew = 0;
1872-
1873-
while ((ch = *pin++) != '\0') {
1874-
if (ch != '%') {
1875-
ptoappend = pin - 1;
1876-
ntoappend = 1;
1851+
_PyUnicodeWriter writer;
1852+
_PyUnicodeWriter_Init(&writer);
1853+
writer.overallocate = 1;
1854+
1855+
Py_ssize_t flen = PyUnicode_GET_LENGTH(format);
1856+
Py_ssize_t i = 0;
1857+
Py_ssize_t start = 0;
1858+
Py_ssize_t end = 0;
1859+
while (i != flen) {
1860+
i = PyUnicode_FindChar(format, '%', i, flen, 1);
1861+
if (i < 0) {
1862+
assert(!PyErr_Occurred());
1863+
break;
18771864
}
1878-
else if ((ch = *pin++) == '\0') {
1879-
/* Null byte follows %, copy only '%'.
1880-
*
1881-
* Back the pin up one char so that we catch the null check
1882-
* the next time through the loop.*/
1883-
pin--;
1884-
ptoappend = pin - 1;
1885-
ntoappend = 1;
1865+
end = i;
1866+
i++;
1867+
if (i == flen) {
1868+
break;
18861869
}
1870+
Py_UCS4 ch = PyUnicode_READ_CHAR(format, i);
1871+
i++;
18871872
/* A % has been seen and ch is the character after it. */
1888-
else if (ch == 'z') {
1873+
PyObject *replacement = NULL;
1874+
if (ch == 'z') {
18891875
/* %z -> +HHMM */
18901876
if (zreplacement == NULL) {
18911877
zreplacement = make_somezreplacement(object, "", tzinfoarg);
18921878
if (zreplacement == NULL)
1893-
goto Done;
1879+
goto Error;
18941880
}
1895-
assert(zreplacement != NULL);
1896-
assert(PyBytes_Check(zreplacement));
1897-
ptoappend = PyBytes_AS_STRING(zreplacement);
1898-
ntoappend = PyBytes_GET_SIZE(zreplacement);
1881+
replacement = zreplacement;
18991882
}
1900-
else if (ch == ':' && *pin == 'z' && pin++) {
1883+
else if (ch == ':' && i < flen && PyUnicode_READ_CHAR(format, i) == 'z') {
19011884
/* %:z -> +HH:MM */
1885+
i++;
19021886
if (colonzreplacement == NULL) {
19031887
colonzreplacement = make_somezreplacement(object, ":", tzinfoarg);
19041888
if (colonzreplacement == NULL)
1905-
goto Done;
1889+
goto Error;
19061890
}
1907-
assert(colonzreplacement != NULL);
1908-
assert(PyBytes_Check(colonzreplacement));
1909-
ptoappend = PyBytes_AS_STRING(colonzreplacement);
1910-
ntoappend = PyBytes_GET_SIZE(colonzreplacement);
1891+
replacement = colonzreplacement;
19111892
}
19121893
else if (ch == 'Z') {
19131894
/* format tzname */
19141895
if (Zreplacement == NULL) {
19151896
Zreplacement = make_Zreplacement(object,
19161897
tzinfoarg);
19171898
if (Zreplacement == NULL)
1918-
goto Done;
1899+
goto Error;
19191900
}
1920-
assert(Zreplacement != NULL);
1921-
assert(PyUnicode_Check(Zreplacement));
1922-
ptoappend = PyUnicode_AsUTF8AndSize(Zreplacement,
1923-
&ntoappend);
1924-
if (ptoappend == NULL)
1925-
goto Done;
1901+
replacement = Zreplacement;
19261902
}
19271903
else if (ch == 'f') {
19281904
/* format microseconds */
19291905
if (freplacement == NULL) {
19301906
freplacement = make_freplacement(object);
19311907
if (freplacement == NULL)
1932-
goto Done;
1908+
goto Error;
19331909
}
1934-
assert(freplacement != NULL);
1935-
assert(PyBytes_Check(freplacement));
1936-
ptoappend = PyBytes_AS_STRING(freplacement);
1937-
ntoappend = PyBytes_GET_SIZE(freplacement);
1910+
replacement = freplacement;
19381911
}
19391912
else {
19401913
/* percent followed by something else */
1941-
ptoappend = pin - 2;
1942-
ntoappend = 2;
1943-
}
1944-
1945-
/* Append the ntoappend chars starting at ptoappend to
1946-
* the new format.
1947-
*/
1948-
if (ntoappend == 0)
19491914
continue;
1950-
assert(ptoappend != NULL);
1951-
assert(ntoappend > 0);
1952-
while (usednew + ntoappend > totalnew) {
1953-
if (totalnew > (PY_SSIZE_T_MAX >> 1)) { /* overflow */
1954-
PyErr_NoMemory();
1955-
goto Done;
1956-
}
1957-
totalnew <<= 1;
1958-
if (_PyBytes_Resize(&newfmt, totalnew) < 0)
1959-
goto Done;
1960-
pnew = PyBytes_AsString(newfmt) + usednew;
19611915
}
1962-
memcpy(pnew, ptoappend, ntoappend);
1963-
pnew += ntoappend;
1964-
usednew += ntoappend;
1965-
assert(usednew <= totalnew);
1916+
assert(replacement != NULL);
1917+
assert(PyUnicode_Check(replacement));
1918+
if (_PyUnicodeWriter_WriteSubstring(&writer, format, start, end) < 0) {
1919+
goto Error;
1920+
}
1921+
start = i;
1922+
if (_PyUnicodeWriter_WriteStr(&writer, replacement) < 0) {
1923+
goto Error;
1924+
}
19661925
} /* end while() */
19671926

1968-
if (_PyBytes_Resize(&newfmt, usednew) < 0)
1969-
goto Done;
1970-
{
1971-
PyObject *format;
1972-
PyObject *strftime = _PyImport_GetModuleAttrString("time", "strftime");
1973-
1974-
if (strftime == NULL)
1927+
PyObject *newformat;
1928+
if (start == 0) {
1929+
_PyUnicodeWriter_Dealloc(&writer);
1930+
newformat = Py_NewRef(format);
1931+
}
1932+
else {
1933+
if (_PyUnicodeWriter_WriteSubstring(&writer, format, start, flen) < 0) {
1934+
goto Error;
1935+
}
1936+
newformat = _PyUnicodeWriter_Finish(&writer);
1937+
if (newformat == NULL) {
19751938
goto Done;
1976-
format = PyUnicode_FromString(PyBytes_AS_STRING(newfmt));
1977-
if (format != NULL) {
1978-
result = PyObject_CallFunctionObjArgs(strftime,
1979-
format, timetuple, NULL);
1980-
Py_DECREF(format);
19811939
}
1982-
Py_DECREF(strftime);
19831940
}
1941+
result = PyObject_CallFunctionObjArgs(strftime,
1942+
newformat, timetuple, NULL);
1943+
Py_DECREF(newformat);
1944+
19841945
Done:
19851946
Py_XDECREF(freplacement);
19861947
Py_XDECREF(zreplacement);
19871948
Py_XDECREF(colonzreplacement);
19881949
Py_XDECREF(Zreplacement);
1989-
Py_XDECREF(newfmt);
1950+
Py_XDECREF(strftime);
19901951
return result;
1952+
1953+
Error:
1954+
_PyUnicodeWriter_Dealloc(&writer);
1955+
goto Done;
19911956
}
19921957

19931958
/* ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)