Experimental code to convert sql.gz files full of SQL commands into usable CSV files, memory-efficiently.
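Usage is a single command: pass the compressed dump as the only argument, and a CSV with the same base name is written next to it. A minimal sketch, assuming the script below is saved as sql_to_csv.py (the filename is illustrative; the example dump name comes from the comments in the code):

    python3 sql_to_csv.py enwiki-latest-langlinks.sql.gz    # writes enwiki-latest-langlinks.csv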
''' Convert humongous compressed sql.gz files full of SQL commands into usable CSV files, memory-efficiently. '''
import gzip
import sys

#gzfile = gzip.open('enwiki-latest-langlinks.sql.gz')
#outfile = open('enwiki-latest-langlinks.csv', 'w', encoding='utf-8')

if len(sys.argv) < 2:
    print('Syntax: %s file_to_import.sql.gz' % sys.argv[0])
    sys.exit(1)

if not sys.argv[1].lower().endswith('.sql.gz'):
    print('Input must be a .sql.gz file')
    sys.exit(1)

# The extension may differ in case, so strip its length here but keep opening
# the file under the name that was actually passed in.
input_base_name = sys.argv[1][:-len('.sql.gz')]
gzfile = gzip.open(sys.argv[1])

# Open the CSV output file using the same base name as the input
outfile = open('%s.csv' % input_base_name, 'w', encoding='utf-8')

# Try to detect table column names and output them as the first CSV row
ENABLE_HEADER = True

# Surround fields with double-quotes, and escape internal double-quotes
ENABLE_DOUBLE_QUOTES = True


def stripCRLF(x):
    ''' Strip CR/LF new-line characters from data '''
    return x.replace('\r', ' ').replace('\n', ' ')


def escapeDQ(x):
    ''' Double up double-quotes so every field stays balanced in the CSV output '''
    return x.replace('"', '""')


lnnum = 0
table_count = 0
header_started = False
header_contents = ''
header_columns = []


def commit(d):
    ''' Save one row to the output file '''
    if ENABLE_HEADER:
        assert len(d) == len(header_columns), 'header column mismatch'
    if ENABLE_DOUBLE_QUOTES:
        csv = ','.join(['"%s"' % escapeDQ(stripCRLF(x)) for x in d])
    else:
        csv = ','.join(['%s' % stripCRLF(x) for x in d])
    outfile.write(csv + '\n')


def processHeader(contents):
    ''' Extract column names from the accumulated CREATE TABLE statement '''
    contents = contents[contents.index('(') + 1:]
    clauses = contents.split(',')
    hdrs = []
    for clause in clauses:
        clause = clause.strip()
        #print('CLAUSE', clause)
        if clause.startswith('`'):
            # it's a real column name (at least for the Wikipedia dump format)
            hdrs.append(clause.split('`')[1])
        else:
            # as soon as the backtick-quoted names stop, we're done
            break
    print('Detected table headers:', hdrs)
    return hdrs


for ln in gzfile:
    lnnum += 1
    ln = ln.decode('utf-8', errors='ignore').strip()
    # just in case...
    assert len(ln) <= 10000000, 'line too long'

    if ln.startswith('CREATE TABLE'):
        table_count += 1
        assert table_count == 1, 'only one table is supported per file'
        if ENABLE_HEADER:
            header_started = True

    if ENABLE_HEADER and header_started:
        if ln.startswith('INSERT INTO'):
            header_started = False
            # process the accumulated header and write it as the first line of the file
            header_columns = processHeader(header_contents)
            commit(header_columns)
        else:
            header_contents += ln + '\n'

    if not ln.startswith('INSERT INTO'):
        continue  # not an important line

    ln = ln[ln.index('VALUES'):].strip()
    print('Processing line %d...' % lnnum)
    print(len(ln), ln[0:40], ln[-40:])

    stack = []
    data = []
    escape_bit = False
    commit_count = 0
    for c in ln:
        #print('STACK:', stack, 'DATA:', data, 'CHAR:', c)
        if c == '\\':
            if escape_bit:
                escape_bit = False
                # an escaped backslash: add a literal backslash to the data
                data[-1] += c
            else:
                escape_bit = True
        elif not escape_bit:
            if c == '(':
                # parentheses don't need to be escaped inside strings
                if "'" not in stack:
                    stack.append('(')
                    data.append('')  # expect new data here
                else:
                    # act as a normal parenthesis character
                    data[-1] += c
            elif c == "'":
                if '(' in stack:
                    if "'" not in stack:
                        stack.append("'")
                    else:
                        stack = stack[:-1]
            elif c == ')':
                if '(' in stack:
                    if "'" not in stack:
                        # end of a value tuple: write it out
                        commit(data)
                        commit_count += 1
                        #if commit_count == 50: sys.exit(1)
                        data = []
                        stack = []
                    else:
                        # act as a normal parenthesis character
                        data[-1] += c
            elif c == ',':
                if '(' in stack:
                    if "'" not in stack:
                        stack.append(',')
                        data.append('')  # expect new data here
                    else:
                        # act as a normal comma character
                        data[-1] += c
            else:
                if '(' in stack:
                    data[-1] += c
        else:
            # escape_bit is set: decode the escape sequence
            if '(' in stack:
                if c == 'n':      # \n newline
                    data[-1] += '\n'
                elif c == 'r':    # \r carriage return
                    data[-1] += '\r'
                elif c == 't':    # \t tab
                    data[-1] += '\t'
                elif c == "'":    # \' single-quote
                    data[-1] += "'"
                elif c == '"':    # \" double-quote
                    data[-1] += '"'
                else:
                    print('Unknown escape sequence: \\%s' % c)
                    data[-1] += '\\' + c
                escape_bit = False  # reset escape_bit
            else:
                assert False, 'escape outside of clause?'

outfile.flush()
outfile.close()
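With ENABLE_DOUBLE_QUOTES left on, the output is plain CSV with every field double-quoted and internal quotes doubled, so Python's csv module reads it back with its default dialect. A minimal sketch, assuming the converter was run on enwiki-latest-langlinks.sql.gz with the defaults above:

import csv

with open('enwiki-latest-langlinks.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)   # default dialect handles the doubled double-quotes
    header = next(reader)    # first row holds the detected column names (ENABLE_HEADER)
    for row in reader:
        print(dict(zip(header, row)))
        break                # show just the first data row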