Convert huge SQL files to CSV files (experimental)

Experimental code to convert compressed .sql.gz dump files full of SQL commands into usable CSV files, memory-efficiently.

 

'''
Convert humongous compressed sql.gz files full of SQL commands into usable
CSV files, memory-efficiently.
'''

import gzip
import html
import sys
import string
import unicodedata
import pickle
from html.parser import HTMLParser

# Original hard-coded file names, kept for reference:
#gzfile = gzip.open('enwiki-latest-langlinks.sql.gz')
#outfile = open('enwiki-latest-langlinks.csv', 'w', encoding='utf-8')

# --- command-line handling -------------------------------------------
# Exactly one argument is required: the compressed SQL dump to convert.
if len(sys.argv) < 2:
  print('Syntax: %s file_to_import.sql.gz' % sys.argv[0])
  sys.exit(1)

input_path = sys.argv[1]

if not input_path.lower().endswith('.sql.gz'):
  print('Input must be .sql.gz file')
  sys.exit(1)

# Base name without the .sql.gz suffix.  Slice by length (rather than
# replacing the literal suffix) so a differently-cased extension such as
# .SQL.GZ is still removed correctly.
input_base_name = input_path[:-len('.sql.gz')]

# Open the compressed dump for streaming, line-by-line reads.
gzfile = gzip.open(input_path)

# The CSV output reuses the input's base name.
outfile = open('%s.csv' % input_base_name, 'w', encoding='utf-8')

# Try to detect table column names and emit them as a CSV header row.
ENABLE_HEADER = True

# Surround fields with double-quotes, and escape internal double-quotes.
ENABLE_DOUBLE_QUOTES = True

def stripCRLF(x):
  '''
  Replace CR and LF new-line characters in *x* with single spaces so a
  field never spans multiple CSV rows.
  '''
  cleaned = x
  for newline_char in ('\r', '\n'):
    cleaned = cleaned.replace(newline_char, ' ')
  return cleaned

def escapeDQ(x):
  '''
  Double every double-quote in *x* (CSV-style escaping), so the field
  always contains an even number of double-quote characters.
  '''
  return '""'.join(x.split('"'))

# Parser state shared with the main loop below.
lnnum = 0               # current input line number (1-based)
table_count = 0         # number of CREATE TABLE statements seen so far
header_started = False  # True while accumulating the CREATE TABLE body
header_contents = ''    # raw text of the CREATE TABLE statement
header_columns = []     # detected column names, in declaration order

def commit(d):
  '''
  Write one record (a list of field strings) to the CSV output file,
  applying the configured quoting and new-line stripping.
  '''
  if ENABLE_HEADER:
    # Number of fields must match the detected column count.
    assert len(d) == len(header_columns), 'header column mismatch'

  if ENABLE_DOUBLE_QUOTES:
    fields = ['"%s"' % escapeDQ(stripCRLF(field)) for field in d]
  else:
    fields = ['%s' % stripCRLF(field) for field in d]
  outfile.write(','.join(fields) + '\n')

def processHeader(contents):
  '''
  Extract column names from an accumulated CREATE TABLE statement.

  contents: the raw CREATE TABLE text (everything up to the first
            INSERT INTO line).
  Returns a list of column-name strings, in declaration order.
  '''
  # Skip past the opening parenthesis of the column list.
  contents = contents[contents.index('(')+1:]

  # Split into clauses on commas at parenthesis depth 0 only.  A naive
  # str.split(',') cuts parenthesized types such as decimal(10,2) in
  # half; the leftover fragment ("2) NOT NULL") does not start with a
  # backtick and used to trigger the early break below, silently
  # dropping every column declared after such a type.
  clauses = []
  depth = 0
  current = ''
  for ch in contents:
    if ch == '(':
      depth += 1
    elif ch == ')':
      depth -= 1
    if ch == ',' and depth == 0:
      clauses.append(current)
      current = ''
    else:
      current += ch
  clauses.append(current)  # trailing clause has no comma after it

  hdrs = []
  for clause in clauses:
    clause = clause.strip()
    #print('CLAUSE', clause)
    if clause.startswith("`"):
      # it's a real column name (at least for the Wikipedia dump format)
      hdrs.append(clause.split("`")[1])
    else:
      # Column definitions come first; the first non-backtick clause
      # (PRIMARY KEY, UNIQUE KEY, the closing paren, ...) ends them.
      break # that's it
  print('Detected table headers:', hdrs)
  return hdrs

# --- main pass over the dump -----------------------------------------
for ln in gzfile:
  lnnum += 1
  # Decode each raw line, silently dropping any invalid UTF-8 bytes.
  ln = ln.decode('utf-8', errors='ignore').strip()

  # just in case... guard against a pathological single line
  assert len(ln) <= 10000000, 'line too long'

  if ln.startswith('CREATE TABLE'):
    table_count += 1
    assert table_count == 1, 'only one table is supported per file'

    if ENABLE_HEADER:
      # Start collecting the CREATE TABLE body so column names can be parsed.
      header_started = True

  if ENABLE_HEADER:
    if header_started:
      if ln.startswith('INSERT INTO'):
        # The first INSERT marks the end of the CREATE TABLE statement.
        header_started = False
        # process header
        header_columns = processHeader(header_contents)

        # write as first line to file
        commit(header_columns)
      else:
        header_contents += ln + '\n'

  if not ln.startswith('INSERT INTO'):
    continue # not important line

  # Keep only the VALUES (...),(...),... portion of the INSERT statement.
  ln = ln[ln.index('VALUES'):].strip()

  print('Processing line %d...' % lnnum)
  print(len(ln), ln[0:40], ln[-40:])

  # Character-level state machine over the VALUES list:
  #   stack      -- holds '(' while inside a record tuple and "'" while
  #                 inside a string literal (',' is also pushed but never
  #                 popped; only membership is tested, so that is harmless)
  #   data       -- accumulated field strings for the current record
  #   escape_bit -- True when the previous character was an unescaped backslash
  stack = []
  data = []
  escape_bit = False
  commit_count = 0

  for c in ln:
    #print(stack, data)

    #print('STACK:', stack)
    #print('DATA:', data)
    #print('CHAR:', c)

    if c == '\\':
      if escape_bit:
        escape_bit = False
        # add actual \\ to data (a backslash escaped by a backslash)
        tmp = data[-1] + c
        data[-1] = tmp
      else:
        escape_bit = True
    else:
      if not escape_bit:
        if c == '(':
          #assert '(' not in stack
          # actually parentheses don't need to be escaped

          if "'" not in stack:
            # Opening a new record tuple.
            stack.append('(')
            data.append('')     # expect new data here
          else:
            # act as normal parentheses character
            tmp = data[-1] + c
            data[-1] = tmp
        elif c == "'":
          if '(' in stack:
            if "'" not in stack:
              # Entering a string literal.
              stack.append("'")
            else:
              # Leaving the string literal.
              stack = stack[:-1]
              #tmp = data[-1] + c
              #data[-1] = tmp
        elif c == ')':
          if '(' in stack:
            if "'" not in stack:
              # End of a record tuple: write it out and reset state.
              commit(data)
              commit_count += 1
              #if commit_count == 50:
              #	sys.exit(1)
              data = []
              stack = []
            else:
              # act as normal parentheses character
              tmp = data[-1] + c
              data[-1] = tmp
        elif c == ',':
          if '(' in stack:
            if "'" not in stack:
              # Field separator: start accumulating the next field.
              stack.append(",")
              data.append('')     # expect new data here
            else:
              # act as normal comma character
              tmp = data[-1] + c
              data[-1] = tmp
        else:
          if '(' in stack:
            # Ordinary character inside a record: append to current field.
            tmp = data[-1] + c
            data[-1] = tmp
      else:
        # Previous character was a backslash: interpret the escape sequence.
        if '(' in stack:
          if c == 'n' and escape_bit:
            # \n newline
            tmp = data[-1] + '\n'
            data[-1] = tmp
          elif c == 'r' and escape_bit:
            # \r newline
            tmp = data[-1] + '\r'
            data[-1] = tmp
          elif c == 't' and escape_bit:
            # \t tab
            tmp = data[-1] + '\t'
            data[-1] = tmp
          elif c == "'" and escape_bit:
            # \' single-quote
            tmp = data[-1] + "'"
            data[-1] = tmp
          elif c == '"' and escape_bit:
            # \" double-quote
            tmp = data[-1] + '"'
            data[-1] = tmp
          else:
            # Unrecognized escape: keep the backslash and character verbatim.
            print('Unknown escape sequence: \\%s' % c)
            tmp = data[-1] + '\\' + c
            data[-1] = tmp

          escape_bit = False # reset escape_bit
        else:
          assert None, 'escape outside of clause?'
        #else:
        #	if c == ',':
        #		data = []
        #		stack = []
        #	else:
        #		print('unknown: end?', c)

  #break

  #
  #outfile.write()
  # Flush after each INSERT line so progress is durable on disk.
  outfile.flush()

outfile.close()

 

Leave a Reply

Your email address will not be published. Required fields are marked *