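Auto-detect which Korean text encoding (UTF-8, CP949, or EUC-KR) each file in the current directory uses. Helpful when processing large amounts of Korean data with inconsistent encodings. Note: only files directly in the current directory are checked; subdirectories are not traversed, so recursive scanning would probably be a useful extension.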
#!/usr/bin/python3
import os
from os import listdir, mkdir
from os.path import isfile, join
mypath = '.'

# (codec name, label to print) pairs, tried in order.
# UTF-8 goes first because valid multi-byte UTF-8 is very unlikely to occur by
# accident. EUC-KR must be tried before CP949: CP949 is a superset of EUC-KR,
# so trying CP949 first would accept every EUC-KR file and the EUC-KR label
# could never be reported.
_ENCODINGS = (
    ('utf-8', 'UTF-8'),
    ('euc-kr', 'EUC-KR'),
    ('cp949', 'CP949'),
)


def detect_encoding(path):
    """Return a label for the first candidate encoding that decodes *path*.

    The file is read once as raw bytes and decoded with each codec in
    ``_ENCODINGS`` in turn; the label of the first codec that decodes the
    whole content without error is returned.

    Args:
        path: Path of the file to inspect.

    Returns:
        ``'UTF-8'``, ``'EUC-KR'`` or ``'CP949'``, or ``'Unknown'`` when the
        file cannot be read or none of the candidate codecs can decode it.
    """
    try:
        # Binary mode plus ``with``: the handle is always closed, and the
        # content is read once instead of once per attempted encoding.
        with open(path, 'rb') as fd:
            raw = fd.read()
    except OSError:
        # Unreadable files are reported as 'Unknown' rather than aborting
        # the whole scan.
        return 'Unknown'

    for codec, label in _ENCODINGS:
        try:
            raw.decode(codec)
        except UnicodeDecodeError:
            continue
        return label
    return 'Unknown'


def main():
    """Print ``<name>: <encoding>`` for every regular file in ``mypath``."""
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    for f in onlyfiles:
        # Join with mypath so the scan still works if mypath is not '.'.
        print('%s: %s' % (f, detect_encoding(join(mypath, f))))


if __name__ == '__main__':
    main()