Auto-detect the Korean text encoding (UTF-8, CP949, or EUC-KR) of each file in the current directory. Helpful when processing large amounts of Korean data with inconsistent encodings. Only the current directory is scanned; the script should probably be extended to run recursively.
#!/usr/bin/python3
"""Report the text encoding of every regular file in a directory.

Each file is tested against UTF-8, EUC-KR and CP949, in that order, and the
label of the first encoding that decodes the whole file without error is
printed next to the file name.  Files that match none of the candidates (or
that cannot be read) are reported as "Unknown".  Subdirectories are not
traversed.
"""
import os
from os import listdir, mkdir
from os.path import isfile, join

mypath = '.'

# (label, codec) pairs, tried in order.  EUC-KR must come before CP949:
# CP949 is a superset of EUC-KR, so any valid EUC-KR file also decodes as
# CP949 and would never be reported as EUC-KR if CP949 were tried first.
ENCODINGS = (
    ('UTF-8', 'utf-8'),
    ('EUC-KR', 'euc-kr'),
    ('CP949', 'cp949'),
)


def detect_encoding(path):
    """Return the label of the first candidate encoding that decodes *path*.

    Args:
        path: Path of the file to inspect.

    Returns:
        One of the labels in ``ENCODINGS`` ('UTF-8', 'EUC-KR', 'CP949'), or
        ``None`` when the file cannot be read or none of the candidate
        encodings decodes its full contents.
    """
    try:
        # Read the raw bytes once; every candidate is tried on the same data.
        with open(path, 'rb') as fd:
            data = fd.read()
    except OSError:
        # Unreadable files are reported as unknown rather than aborting the
        # whole scan.
        return None

    for label, codec in ENCODINGS:
        try:
            data.decode(codec)
        except UnicodeDecodeError:
            continue
        return label
    return None


def main(directory=mypath):
    """Print ``<name>: <encoding>`` for each regular file in *directory*."""
    for name in listdir(directory):
        path = join(directory, name)
        if not isfile(path):
            continue
        print('%s: %s' % (name, detect_encoding(path) or 'Unknown'))


if __name__ == '__main__':
    main()