Techniques - Data processing scripts

A basic Python illustration

A very simple illustration (extract_from_katrin.py):
#!/usr/bin/python

# Syntax:
#   extract_from_katrin.py outputfile inputfiles
# e.g.
#   extract_from_katrin.py output.txt input*.txt
#
# Purpose: find data located a certain number of lines below particular literal strings (literal1 and literal2, below), in multiple files, and send the data to a single output file.
# Techniques illustrated: basic command-line argument processing; expanding file wildcards; file I/O; searching.

import sys, glob, re

def open_file(file_name, mode):
    """Open a file, ending the program with an error message on failure.

    Parameters:
        file_name -- path of the file to open.
        mode      -- mode string passed straight to the built-in open().

    Returns the open file object.

    If the file cannot be opened (IOError), the error is reported on standard
    output and the program terminates via sys.exit(1), i.e. SystemExit with a
    non-zero status, so that shells and batch scripts can detect the failure.
    """
    try:
        return open(file_name, mode)
    except IOError as e:
        # Single-argument print(...) behaves identically under Python 2 and 3.
        print("Unable to open the file %s Ending program.\n%s" % (file_name, e))
        # Only pause for a key press when someone is actually at the keyboard;
        # with redirected or closed input (batch jobs, pipes) waiting would
        # hang or raise instead of reporting the real problem.
        stdin = sys.stdin
        if stdin is not None and stdin.isatty():
            sys.stdout.write("\n\nPress the enter key to exit.")
            sys.stdout.flush()
            stdin.readline()
        sys.exit(1)

############################

# Header row written once at the top of the output CSV file.
CSV_HEADER = "DateTimeCode,Filename,Subject,Res_left,Res_right,Rew_left,Rew_right,Taken_left,Taken_right,Miss_left,Miss_right,LLat_left,LLat_right,RewTLat_left,RewTLat_right,gRewTLat,RetrLat_left,RetrLat_right,gRetrLat,Sound_left,Sound_right,STLat_left,STLat_right,gSTLat,SRetLat_left,SRetLat_right,gSRetLat\n"

# Marker strings, copied from a sample input file, and the distance (in lines)
# from each marker down to the data line we want.
LITERAL1 = "Res left, Res right, Rew left, Rew right, Taken left, Taken right, Miss left, Miss right, LLat left, LLat right, RewTLat left, RewTLat right, gRewTLat, RetrLat left, RetrLat right, gRetrLat, Sound left, Sound right, STLat left, STLat right, gSTLat, SRetLat left, SRetLat right, gSRetLat"
LITERAL1_OFFSET = 2  # the data is TWO LINES BELOW the marker
LITERAL2 = "DateTimeCode,Filename,Subject"
LITERAL2_OFFSET = 1  # the data is ONE LINE BELOW the marker


def find_data_line(lines, literal, offset):
    """Return the data line found `offset` lines below the first marker line.

    Parameters:
        lines   -- list of the file's lines (as produced by readlines()).
        literal -- marker text to look for; matched case-insensitively as a
                   plain substring (regex metacharacters are escaped).
        offset  -- how many lines below the marker line the data sits.

    Returns the data line with leading/trailing whitespace stripped and all
    spaces removed, or None if the marker is absent or the file ends before
    the data line (e.g. a truncated file).
    """
    pattern = re.compile(re.escape(literal), re.IGNORECASE)
    for linenum, line in enumerate(lines):
        if pattern.search(line):
            target = linenum + offset
            if target >= len(lines):
                # Marker too close to the end of the file: no data to fetch.
                return None
            return lines[target].strip().replace(' ', '')
    return None


def main(argv):
    """Extract the data lines from every input file into one CSV file.

    argv is the full command line: program name, output file name, then one
    or more input file names or wildcard patterns.
    """
    if len(argv) < 3:  # the program name is one of these
        sys.exit("Syntax: extract_from_katrin.py outputfile inputfile(s)\n"
                 "For example: extract_from_katrin.py output.csv input*.txt")

    # 1. Which output file?
    outputfile = argv[1]
    print("Output file: %s" % outputfile)

    # 2. Which input files?
    # UNIX shells pre-expand wildcards; Windows doesn't. So we glob the
    # arguments ourselves in case we're running under Windows.
    inputspecs = argv[2:]
    print("Input filespec(s): %s" % (inputspecs,))
    filelist = []
    for spec in inputspecs:
        filelist += glob.glob(spec)
    # set() drops duplicates; sorting makes the processing order (and hence
    # the order of rows in the output) reproducible from run to run.
    filelist = sorted(set(filelist))
    print("Files to be processed: %s" % (filelist,))

    # 3. Go
    print("Processing...")
    with open_file(outputfile, 'w') as outfile:
        outfile.write(CSV_HEADER)
        for filename in filelist:
            with open_file(filename, 'r') as infile:
                lines = infile.readlines()  # one list entry per line
            dataline1 = find_data_line(lines, LITERAL1, LITERAL1_OFFSET)
            dataline2 = find_data_line(lines, LITERAL2, LITERAL2_OFFSET)
            if dataline1 is not None and dataline2 is not None:
                outfile.write(dataline2 + "," + dataline1 + "\n")
                print("Successfully processed file: %s" % filename)
            else:
                print("Failed to find relevant data in file: %s" % filename)
    print("Finished.")


if __name__ == "__main__":
    main(sys.argv)