Difference between revisions of "Python - making a filter that doesn't choke"
Jump to navigation
Jump to search
(4 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
− | This is a note to self that is probably also a useful PSA. Making a traditional UNIX filter command in python that | + | This is a note to self that is probably also a useful PSA. Making a traditional UNIX filter command in python that '''doesn't''' choke on (or mangle) any data is nontrivial. |
− | < | + | '''Update:''' The defaults will change in 3.7: https://docs.python.org/3.7/whatsnew/3.7.html - so eventually this complexity will be unneeded \o/ |
+ | |||
+ | <syntaxhighlight lang="python"> | ||
#!/usr/bin/env python3 | #!/usr/bin/env python3 | ||
− | |||
− | |||
− | |||
import sys | import sys | ||
Line 13: | Line 12: | ||
# | # | ||
# Reason: assumes input is valid utf-8 | # Reason: assumes input is valid utf-8 | ||
− | + | ||
for line in sys.stdin: | for line in sys.stdin: | ||
print(line) | print(line) | ||
− | + | </syntaxhighlight> | |
− | import io | + | <syntaxhighlight lang="python"> |
+ | #!/usr/bin/env python3 | ||
+ | |||
+ | import sys, io | ||
# TAKE 2: fails on print: UnicodeEncodeError: 'utf-8' codec can't encode | # TAKE 2: fails on print: UnicodeEncodeError: 'utf-8' codec can't encode | ||
Line 28: | Line 30: | ||
# all bytes, but fails to print it out again since this internal representation | # all bytes, but fails to print it out again since this internal representation | ||
# has no implicit conversion to utf-8 | # has no implicit conversion to utf-8 | ||
− | + | ||
stdi = io.TextIOWrapper(sys.stdin.buffer, | stdi = io.TextIOWrapper(sys.stdin.buffer, | ||
encoding='utf-8', errors='surrogateescape') | encoding='utf-8', errors='surrogateescape') | ||
for line in stdi: | for line in stdi: | ||
print(line) | print(line) | ||
− | "" | + | </syntaxhighlight> |
+ | |||
+ | <syntaxhighlight lang="python"> | ||
+ | #!/usr/bin/env python3 | ||
+ | |||
+ | import sys, io | ||
# TAKE 3: works? except cr/lf | # TAKE 3: works? except cr/lf | ||
Line 39: | Line 46: | ||
# Reason: data now comes through unchanged via surrogate hack, but newline | # Reason: data now comes through unchanged via surrogate hack, but newline | ||
# conversions are done so it's still lossy | # conversions are done so it's still lossy | ||
− | + | ||
stdi = io.TextIOWrapper(sys.stdin.buffer, | stdi = io.TextIOWrapper(sys.stdin.buffer, | ||
encoding='utf-8', errors='surrogateescape') | encoding='utf-8', errors='surrogateescape') | ||
for line in stdi: | for line in stdi: | ||
sys.stdout.buffer.write(line.encode('utf-8', errors='surrogateescape')) | sys.stdout.buffer.write(line.encode('utf-8', errors='surrogateescape')) | ||
− | "" | + | </syntaxhighlight> |
+ | |||
+ | <syntaxhighlight lang="python"> | ||
+ | #!/usr/bin/env python3 | ||
+ | |||
+ | import sys | ||
# TAKE 4: pure binary, works but you don't pretend it's text and lines | # TAKE 4: pure binary, works but you don't pretend it's text and lines | ||
Line 50: | Line 62: | ||
# Reason: this is nice and clean and obviously what you want if you have no | # Reason: this is nice and clean and obviously what you want if you have no | ||
# need to reason about it as text or lines | # need to reason about it as text or lines | ||
− | + | ||
sys.stdout.buffer.write(sys.stdin.buffer.read()) | sys.stdout.buffer.write(sys.stdin.buffer.read()) | ||
− | "" | + | </syntaxhighlight> |
+ | |||
+ | <syntaxhighlight lang="python"> | ||
+ | #!/usr/bin/env python3 | ||
+ | |||
+ | import sys, io | ||
# TAKE 5: works | # TAKE 5: works | ||
Line 59: | Line 76: | ||
# same output as input, while pretending it's text and lines. whew! it's not | # same output as input, while pretending it's text and lines. whew! it's not | ||
# clear to me why this is not implicit default for sys.stdin, as it is for | # clear to me why this is not implicit default for sys.stdin, as it is for | ||
− | # sys.argv and os.environ | + | # sys.argv and os.environ (update: changes in 3.7) |
stdi = io.TextIOWrapper(sys.stdin.buffer, | stdi = io.TextIOWrapper(sys.stdin.buffer, | ||
encoding='utf-8', errors='surrogateescape', newline='') | encoding='utf-8', errors='surrogateescape', newline='') | ||
for line in stdi: | for line in stdi: | ||
sys.stdout.buffer.write(line.encode('utf-8', errors='surrogateescape')) | sys.stdout.buffer.write(line.encode('utf-8', errors='surrogateescape')) | ||
− | </ | + | </syntaxhighlight> |
Latest revision as of 09:00, 16 January 2020
This is a note to self that is probably also a useful PSA. Making a traditional UNIX filter command in Python that doesn't choke on (or mangle) any data is nontrivial.
Update: The defaults will change in 3.7: https://docs.python.org/3.7/whatsnew/3.7.html - so eventually this complexity will be unneeded \o/
#!/usr/bin/env python3
import sys

# TAKE 1: fails on read: UnicodeDecodeError: 'utf-8' codec can't decode
# byte 0xe5 in position 1: invalid continuation byte
#
# Reason: assumes input is valid utf-8
for line in sys.stdin:
    # line still carries its own trailing newline (if any), so print() must
    # not append a second one
    print(line, end='')
#!/usr/bin/env python3
import io
import sys

# TAKE 2: fails on print: UnicodeEncodeError: 'utf-8' codec can't encode
# character '\udce5' in position 1: surrogates not allowed
#
# Works with errors='replace' but is then lossy.
#
# Reason: will now accept any input since the py3 surrogate hack can represent
# all bytes, but fails to print it out again since this internal representation
# has no implicit conversion to utf-8
stdi = io.TextIOWrapper(sys.stdin.buffer,
                        encoding='utf-8', errors='surrogateescape')
for line in stdi:
    # line still carries its own trailing newline (if any), so print() must
    # not append a second one
    print(line, end='')
#!/usr/bin/env python3
import io
import sys

# TAKE 3: works? except cr/lf
#
# Reason: data now comes through unchanged via surrogate hack, but newline
# conversions are done so it's still lossy
stdi = io.TextIOWrapper(sys.stdin.buffer,
                        encoding='utf-8', errors='surrogateescape')
for line in stdi:
    # encode with the same error handler used for decoding, so the escaped
    # surrogates turn back into the original bytes; write to the binary
    # layer to bypass stdout's own text encoding
    sys.stdout.buffer.write(line.encode('utf-8', errors='surrogateescape'))
#!/usr/bin/env python3
import sys

# TAKE 4: pure binary passthrough -- works, but the data is never treated as
# text or as lines
#
# Reason: clean and simple, and exactly what you want when there is no need
# to reason about the input as text or lines

# slurp all of stdin as raw bytes, then emit those bytes untouched
payload = sys.stdin.buffer.read()
sys.stdout.buffer.write(payload)
#!/usr/bin/env python3
import io
import sys

# TAKE 5: works
#
# Reason: instructed to ignore newline conversions, we now seem to get the
# same output as input, while pretending it's text and lines. whew! it's not
# clear to me why this is not implicit default for sys.stdin, as it is for
# sys.argv and os.environ (update: changes in 3.7)
stdi = io.TextIOWrapper(sys.stdin.buffer,
                        encoding='utf-8', errors='surrogateescape', newline='')
for line in stdi:
    # newline='' leaves line endings untranslated on read; encoding with the
    # same surrogateescape handler restores any undecodable input bytes
    sys.stdout.buffer.write(line.encode('utf-8', errors='surrogateescape'))