'''
Experimental code to convert humongous compressed sql.gz files full of SQL
commands into usable CSV files, memory-efficiently.
'''
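# Example invocation (the script file name here is only illustrative):
#
#     python convert_sql_to_csv.py enwiki-latest-langlinks.sql.gz
#
# This streams the compressed dump line by line and writes
# enwiki-latest-langlinks.csv next to the input file.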
import gzip
import sys
if len(sys.argv) < 2:
    print('Syntax: %s file_to_import.sql.gz' % sys.argv[0])
    sys.exit(1)
if not sys.argv[1].lower().endswith('.sql.gz'):
    print('Input must be a .sql.gz file')
    sys.exit(1)
# Derive the output base name; keep using sys.argv[1] itself when opening,
# since the actual extension may differ in case from '.sql.gz'.
input_base_name = sys.argv[1][:-len('.sql.gz')]
gzfile = gzip.open(sys.argv[1])
# Open the CSV output file using the same base name as the input.
outfile = open('%s.csv' % input_base_name, 'w', encoding='utf-8')
# Try to detect table column names and output
ENABLE_HEADER = True
# Surround fields with double-quotes, and escape internal double-quotes
ENABLE_DOUBLE_QUOTES = True
def stripCRLF(x):
    '''Replace CR/LF new-line characters in a field with spaces.'''
    return x.replace('\r', ' ').replace('\n', ' ')
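# For example (values chosen purely for illustration):
#     stripCRLF('line one\r\nline two')  ->  'line one  line two'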
def escapeDQ(x):
    '''
    Double up any double-quote characters so a field can be safely wrapped
    in double-quotes for CSV output.
    '''
    return x.replace('"', '""')
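# For example (value chosen purely for illustration):
#     escapeDQ('say "hi"')  ->  'say ""hi""'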
# Running state for line counting and header detection.
lnnum = 0
table_count = 0
header_started = False
header_contents = ''
header_columns = []
def commit(d):
    '''Write one row of field values to the CSV output file.'''
    if ENABLE_HEADER:
        assert len(d) == len(header_columns), 'header column mismatch'
    if ENABLE_DOUBLE_QUOTES:
        csv = ','.join(['"%s"' % escapeDQ(stripCRLF(x)) for x in d])
        outfile.write(csv + '\n')
    else:
        csv = ','.join(['%s' % stripCRLF(x) for x in d])
        outfile.write(csv + '\n')
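# For example, with ENABLE_DOUBLE_QUOTES on and a two-column header already
# detected (values chosen purely for illustration):
#     commit(['1', 'He said "hi"'])
# appends the line:
#     "1","He said ""hi"""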
def processHeader(contents):
    '''Extract the column names from the accumulated CREATE TABLE statement.'''
    contents = contents[contents.index('(')+1:]
    clauses = contents.split(',')
    hdrs = []
    for clause in clauses:
        clause = clause.strip()
        if clause.startswith("`"):
            # a real column name (at least for the Wikipedia dump format)
            hdrs.append(clause.split("`")[1].split("`")[0])
        else:
            # column definitions come first; as soon as they stop, we're done
            break
    print('Detected table headers:', hdrs)
    return hdrs
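# For example, feeding processHeader() a shortened, hypothetical statement
# such as:
#     CREATE TABLE `langlinks` (
#       `ll_from` int unsigned NOT NULL DEFAULT 0,
#       `ll_lang` varbinary(35) NOT NULL DEFAULT '',
#       `ll_title` varbinary(255) NOT NULL DEFAULT '',
#       PRIMARY KEY (`ll_from`,`ll_lang`)
# should yield ['ll_from', 'll_lang', 'll_title'], since scanning stops at the
# first clause that does not begin with a backtick.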
for ln in gzfile:
    lnnum += 1
    ln = ln.decode('utf-8', errors='ignore').strip()
    # Sanity check, just in case: no single line should be anywhere near this long.
    assert len(ln) <= 10000000, 'line too long'
    if ln.startswith('CREATE TABLE'):
        table_count += 1
        assert table_count == 1, 'only one table is supported per file'
        if ENABLE_HEADER:
            header_started = True
    if ENABLE_HEADER:
        if header_started:
            if ln.startswith('INSERT INTO'):
                # The CREATE TABLE statement has ended; parse it and write the
                # column names as the first line of the CSV file.
                header_started = False
                header_columns = processHeader(header_contents)
                commit(header_columns)
            else:
                header_contents += ln + '\n'
    if not ln.startswith('INSERT INTO'):
        continue  # not an interesting line
    # Keep only the VALUES (...) portion of the INSERT statement.
    ln = ln[ln.index('VALUES'):].strip()
    print('Processing line %d...' % lnnum)
    print(len(ln), ln[0:40], ln[-40:])
    # Parser state for walking the VALUES list character by character.
    stack = []
    data = []
    escape_bit = False
    commit_count = 0
    for c in ln:
        if c == '\\':
            if escape_bit:
                # an escaped backslash: add an actual \ to the data
                escape_bit = False
                data[-1] += c
            else:
                escape_bit = True
        else:
            if not escape_bit:
                if c == '(':
                    # parentheses are not backslash-escaped inside strings, so
                    # only treat this as a tuple opener outside of quotes
                    if "'" not in stack:
                        stack.append('(')
                        data.append('')  # expect new data here
                    else:
                        # act as a normal parentheses character
                        data[-1] += c
                elif c == "'":
                    if '(' in stack:
                        if "'" not in stack:
                            stack.append("'")   # string starts
                        else:
                            stack = stack[:-1]  # string ends
                elif c == ')':
                    if '(' in stack:
                        if "'" not in stack:
                            # end of one tuple: write it out and reset state
                            commit(data)
                            commit_count += 1
                            data = []
                            stack = []
                        else:
                            # act as a normal parentheses character
                            data[-1] += c
                elif c == ',':
                    if '(' in stack:
                        if "'" not in stack:
                            stack.append(",")
                            data.append('')  # expect new data here
                        else:
                            # act as a normal comma character
                            data[-1] += c
                else:
                    if '(' in stack:
                        data[-1] += c
            else:
                # previous character was a backslash: decode the escape sequence
                if '(' in stack:
                    if c == 'n':
                        data[-1] += '\n'   # \n newline
                    elif c == 'r':
                        data[-1] += '\r'   # \r carriage return
                    elif c == 't':
                        data[-1] += '\t'   # \t tab
                    elif c == "'":
                        data[-1] += "'"    # \' single-quote
                    elif c == '"':
                        data[-1] += '"'    # \" double-quote
                    else:
                        print('Unknown escape sequence: \\%s' % c)
                        data[-1] += '\\' + c
                    escape_bit = False  # reset escape_bit
                else:
                    assert False, 'escape outside of a VALUES tuple?'
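# For example, a made-up statement line such as:
#     INSERT INTO `t` VALUES (1,'O\'Brien'),(2,'say \"hi\"');
# is emitted (with ENABLE_DOUBLE_QUOTES on, and assuming a matching
# two-column header) as the CSV rows:
#     "1","O'Brien"
#     "2","say ""hi"""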
outfile.flush()
outfile.close()