Techniques — Data processing scripts
|
Back to techniques
https://rcardinal.ddns.net/techniques/ubuntu.html
|
A very simple illustration (extract_from_katrin.py):
#!/usr/bin/python
# Syntax:
#     extract_from_katrin.py outputfile inputfiles
# e.g.
#     extract_from_katrin.py output.txt input*.txt
#
# Purpose: find data located a certain number lines below particular literal
# strings (LITERAL1 and LITERAL2, below), in multiple files, and send the data
# to a single output file.
# Techniques illustrated: basic command-line argument processing; expanding
# file wildcards; file I/O; searching.
#
# The code is written to run unchanged under both Python 2.6+ and Python 3:
# every print() call takes a single pre-formatted string, and the interactive
# prompt function is selected at import time.

import glob
import re
import sys

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3

# Header row written once at the top of the output CSV.
CSV_HEADER = (
    "DateTimeCode,Filename,Subject,"
    "Res_left,Res_right,Rew_left,Rew_right,"
    "Taken_left,Taken_right,Miss_left,Miss_right,"
    "LLat_left,LLat_right,"
    "RewTLat_left,RewTLat_right,gRewTLat,"
    "RetrLat_left,RetrLat_right,gRetrLat,"
    "Sound_left,Sound_right,"
    "STLat_left,STLat_right,gSTLat,"
    "SRetLat_left,SRetLat_right,gSRetLat\n"
)

# Copied from sample. Our first line of interest is TWO LINES BELOW this.
LITERAL1 = (
    "Res left, Res right, Rew left, Rew right, "
    "Taken left, Taken right, Miss left, Miss right, "
    "LLat left, LLat right, "
    "RewTLat left, RewTLat right, gRewTLat, "
    "RetrLat left, RetrLat right, gRetrLat, "
    "Sound left, Sound right, "
    "STLat left, STLat right, gSTLat, "
    "SRetLat left, SRetLat right, gSRetLat"
)
LITERAL1_OFFSET = 2

# Copied from sample. Our first line of interest is ONE LINE BELOW this.
LITERAL2 = "DateTimeCode,Filename,Subject"
LITERAL2_OFFSET = 1

# The markers are literal text, so escape them before compiling; matching is
# case-insensitive. Compiled once here rather than once per line.
_PATTERN1 = re.compile(re.escape(LITERAL1), re.IGNORECASE)
_PATTERN2 = re.compile(re.escape(LITERAL2), re.IGNORECASE)


def open_file(file_name, mode):
    """Open a file and return the file object.

    If the file cannot be opened, print an explanation, wait for the user to
    press enter, and terminate the program via sys.exit().
    """
    try:
        the_file = open(file_name, mode)
    except IOError as e:
        print("Unable to open the file %s Ending program.\n%s" % (file_name, e))
        _read_line("\n\nPress the enter key to exit.")
        sys.exit()
    else:
        return the_file


def _find_data_line(lines, pattern, offset):
    """Return the line `offset` lines below the first match of `pattern`.

    The returned text has leading/trailing whitespace stripped and all spaces
    removed. Returns None when no line matches, or when the file ends before
    the wanted line (e.g. a truncated file), instead of raising IndexError.
    """
    for index, line in enumerate(lines):
        if pattern.search(line):
            target = index + offset
            if target >= len(lines):
                return None
            return lines[target].strip().replace(' ', '')
    return None


def _extract_row(lines):
    """Build one CSV data row from a file's lines, or None if data is missing."""
    dataline1 = _find_data_line(lines, _PATTERN1, LITERAL1_OFFSET)
    dataline2 = _find_data_line(lines, _PATTERN2, LITERAL2_OFFSET)
    # Compare against None explicitly: an empty data line still counts as found.
    if dataline1 is None or dataline2 is None:
        return None
    return dataline2 + "," + dataline1


def main(argv=None):
    """Run the extraction.

    argv: argument list in sys.argv layout (program name, output file, then
    one or more input filespecs). Defaults to sys.argv.
    Exits with a usage message if fewer than two arguments follow the
    program name.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 3:  # the program name is one of these
        sys.exit("Syntax: extract_from_katrin.py outputfile inputfile(s)\n"
                 "For example: extract_from_katrin.py output.csv input*.txt")

    # 1. Which output file?
    outputfile = argv[1]
    print("Output file: %s" % outputfile)

    # 2. Which input files?
    inputspecs = argv[2:]
    # Now, UNIX will pre-expand wildcards; Windows won't. So we manually glob
    # them out in case we're running under Windows.
    print("Input filespec(s): %s" % (inputspecs,))
    unique_files = set()  # a set drops files matched by more than one spec
    for spec in inputspecs:
        unique_files.update(glob.glob(spec))
    filelist = sorted(unique_files)  # sorted so the output row order is stable
    print("Files to be processed: %s" % (filelist,))

    # 3. Go
    print("Processing...")
    with open_file(outputfile, 'w') as outfile:
        outfile.write(CSV_HEADER)
        for filename in filelist:
            with open_file(filename, 'r') as infile:
                lines = infile.readlines()  # one list entry per line
            row = _extract_row(lines)
            if row is None:
                print("Failed to find relevant data in file: %s" % filename)
            else:
                outfile.write(row + "\n")
                print("Successfully processed file: %s" % filename)
    print("Finished.")


if __name__ == "__main__":
    main()