Difference between revisions of "Python - making a filter that doesn't choke"

From WTFwiki
Jump to navigation Jump to search
Line 5: Line 5:
 
<syntaxhighlight lang="python">
 
<syntaxhighlight lang="python">
 
#!/usr/bin/env python3
 
#!/usr/bin/env python3
 
# How to make a line-based filter in py3 without failing.  Note, do not
 
# attempt in py2.
 
  
 
import sys
 
import sys
Line 15: Line 12:
 
#
 
#
 
# Reason: assumes input is valid utf-8
 
# Reason: assumes input is valid utf-8
"""
+
 
 
for line in sys.stdin:
 
for line in sys.stdin:
 
     print(line)
 
     print(line)
"""
+
</syntaxhighlight>
 +
 
 +
<syntaxhighlight lang="python">
 +
#!/usr/bin/env python3
  
 
import io
 
import io
Line 30: Line 30:
 
# all bytes, but fails to print it out again since this internal representation
 
# all bytes, but fails to print it out again since this internal representation
 
# has no implicit conversion to utf-8
 
# has no implicit conversion to utf-8
"""
+
 
 
stdi = io.TextIOWrapper(sys.stdin.buffer,
 
stdi = io.TextIOWrapper(sys.stdin.buffer,
 
                         encoding='utf-8', errors='surrogateescape')
 
                         encoding='utf-8', errors='surrogateescape')
 
for line in stdi:
 
for line in stdi:
 
     print(line)
 
     print(line)
"""
+
</syntaxhighlight>
 +
 
 +
<syntaxhighlight lang="python">
 +
#!/usr/bin/env python3
  
 
# TAKE 3: works?  except cr/lf
 
# TAKE 3: works?  except cr/lf
Line 41: Line 44:
 
# Reason: data now comes through unchanged via surrogate hack, but newline
 
# Reason: data now comes through unchanged via surrogate hack, but newline
 
# conversions are done so it's still lossy
 
# conversions are done so it's still lossy
"""
+
 
 
stdi = io.TextIOWrapper(sys.stdin.buffer,
 
stdi = io.TextIOWrapper(sys.stdin.buffer,
 
                         encoding='utf-8', errors='surrogateescape')
 
                         encoding='utf-8', errors='surrogateescape')
 
for line in stdi:
 
for line in stdi:
 
     sys.stdout.buffer.write(line.encode('utf-8', errors='surrogateescape'))
 
     sys.stdout.buffer.write(line.encode('utf-8', errors='surrogateescape'))
"""
+
</syntaxhighlight>
 +
 
 +
<syntaxhighlight lang="python">
 +
#!/usr/bin/env python3
  
 
# TAKE 4: pure binary, works but you don't pretend it's text and lines
 
# TAKE 4: pure binary, works but you don't pretend it's text and lines
Line 52: Line 58:
 
# Reason: this is nice and clean and obviously what you want if you have no
 
# Reason: this is nice and clean and obviously what you want if you have no
 
# need to reason about it as text or lines
 
# need to reason about it as text or lines
"""
+
 
 
sys.stdout.buffer.write(sys.stdin.buffer.read())
 
sys.stdout.buffer.write(sys.stdin.buffer.read())
"""
+
</syntaxhighlight>
 +
 
 +
<syntaxhighlight lang="python">
 +
#!/usr/bin/env python3
  
 
# TAKE 5: works
 
# TAKE 5: works
Line 61: Line 70:
 
# same output as input, while pretending it's text and lines.  whew!  it's not
 
# same output as input, while pretending it's text and lines.  whew!  it's not
 
# clear to me why this is not implicit default for sys.stdin, as it is for
 
# clear to me why this is not implicit default for sys.stdin, as it is for
# sys.argv and os.environ
+
# sys.argv and os.environ (update: changes in 3.7)
 
stdi = io.TextIOWrapper(sys.stdin.buffer,
 
stdi = io.TextIOWrapper(sys.stdin.buffer,
 
                         encoding='utf-8', errors='surrogateescape', newline='')
 
                         encoding='utf-8', errors='surrogateescape', newline='')

Revision as of 07:38, 16 January 2020

This is a note to self that is probably also a useful PSA. Making a traditional UNIX filter command in Python that doesn't choke on (or mangle) any data is nontrivial.

Update: The defaults change in Python 3.7 (see https://docs.python.org/3.7/whatsnew/3.7.html), so eventually this complexity will be unneeded \o/

#!/usr/bin/env python3

import sys

# TAKE 1: fails on read: UnicodeDecodeError: 'utf-8' codec can't decode
# byte 0xe5 in position 1: invalid continuation byte
#
# Reason: assumes input is valid utf-8
#
# Naive line filter: copies stdin to stdout through the default text streams.

for line in sys.stdin:
    # Each line read from stdin keeps its own trailing newline, so suppress
    # the one print() would append; otherwise valid input comes out
    # double-spaced, which is a separate bug from the decode failure shown.
    print(line, end='')
#!/usr/bin/env python3

import io
import sys  # this take is its own script and reads sys.stdin below

# TAKE 2: fails on print: UnicodeEncodeError: 'utf-8' codec can't encode
# character '\udce5' in position 1: surrogates not allowed
#
# Works with errors='replace' but is then lossy.
#
# Reason: will now accept any input since the py3 surrogate hack can represent
# all bytes, but fails to print it out again since this internal representation
# has no implicit conversion to utf-8

# Re-wrap the raw stdin byte stream so undecodable bytes become lone
# surrogates instead of raising on read.
stdi = io.TextIOWrapper(sys.stdin.buffer,
                        encoding='utf-8', errors='surrogateescape')
for line in stdi:
    # The line already carries its newline; end='' avoids doubling it so the
    # only failure left is the encode error this take demonstrates.
    print(line, end='')
#!/usr/bin/env python3

# This take is its own script, so it imports everything it uses.
import io
import sys

# TAKE 3: works?  except cr/lf
#
# Reason: data now comes through unchanged via surrogate hack, but newline
# conversions are done so it's still lossy

# Decode stdin with surrogateescape so arbitrary bytes survive the read.
# newline is left at its default, which is where the cr/lf loss comes from.
stdi = io.TextIOWrapper(sys.stdin.buffer,
                        encoding='utf-8', errors='surrogateescape')
for line in stdi:
    # Encode with the same error handler and write to the byte-level stdout,
    # turning the escaped surrogates back into the original bytes.
    sys.stdout.buffer.write(line.encode('utf-8', errors='surrogateescape'))
#!/usr/bin/env python3

import sys  # this take is its own script and uses sys.stdin / sys.stdout

# TAKE 4: pure binary, works but you don't pretend it's text and lines
#
# Reason: this is nice and clean and obviously what you want if you have no
# need to reason about it as text or lines

# Copy the whole input as bytes: no decoding, no newline handling.
sys.stdout.buffer.write(sys.stdin.buffer.read())
#!/usr/bin/env python3

# This take is its own script, so it imports everything it uses.
import io
import sys

# TAKE 5: works
#
# Reason: instructed to ignore newline conversions, we now seem to get the
# same output as input, while pretending it's text and lines.  whew!  it's not
# clear to me why this is not implicit default for sys.stdin, as it is for
# sys.argv and os.environ (update: changes in 3.7)

# surrogateescape keeps undecodable bytes round-trippable; newline='' turns
# off newline translation so line endings reach the loop untouched.
stdi = io.TextIOWrapper(sys.stdin.buffer,
                        encoding='utf-8', errors='surrogateescape', newline='')
for line in stdi:
    # Encode with the same error handler and write raw bytes, bypassing the
    # text-mode stdout that would reject the escaped surrogates.
    sys.stdout.buffer.write(line.encode('utf-8', errors='surrogateescape'))