#!/usr/bin/env python3
# Roger Volden

'''Small FASTA readers.

oldReader  -> parallel lists of headers and raw sequence lines
readFasta  -> {header: sequence}, original letter case kept
readGenome -> {header: sequence}, sequence upper-cased
'''

import sys


def oldReader(inFile):
    '''Takes a filename and returns a list of headers and sequences.

    Every non-blank line that does not start with '>' becomes its own
    entry in the sequence list, so the two lists only line up when each
    record's sequence sits on a single line.

    Returns a (headers, sequences) tuple of lists.
    '''
    headers, sequences = [], []
    # 'with' guarantees the handle is closed even if reading fails.
    with open(inFile) as handle:
        for line in handle:
            line = line.rstrip()
            if not line:
                continue
            if line.startswith('>'):
                headers.append(line[1:])
            else:
                sequences.append(line)
    return headers, sequences


def _iterRecords(inFile):
    '''Yield (header, list_of_sequence_lines) for each record in inFile.

    Trailing whitespace is stripped from every line, blank lines are
    skipped, and the leading '>' is removed from headers. A header that
    is followed by no sequence lines yields an empty list.

    Raises ValueError if sequence data appears before the first header.
    '''
    # None (rather than '') marks "no header seen yet", because a bare
    # '>' line is a legal header whose name is the empty string.
    header = None
    chunks = []
    with open(inFile) as handle:
        for line in handle:
            line = line.rstrip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the record that came before it.
                if header is not None:
                    yield header, chunks
                header, chunks = line[1:], []
            elif header is None:
                raise ValueError(
                    'sequence data found before the first header in '
                    + repr(inFile)
                )
            else:
                chunks.append(line)
    # Covers the last record in the file; skipped for empty files.
    if header is not None:
        yield header, chunks


def readFasta(inFile):
    '''Take a filename and return a read dictionary.

    readDict = {header:sequence, ...}

    Sequences keep the letter case found in the file. When a header
    occurs more than once, the sequence of its last occurrence is kept.
    An empty file gives an empty dictionary.

    Raises ValueError if sequence data appears before the first header.
    '''
    readDict = {}
    for header, chunks in _iterRecords(inFile):
        readDict[header] = ''.join(chunks)
    return readDict


def readGenome(inFile):
    '''Take a filename and return {header: upper-cased sequence, ...}.

    Joining a list of strings is faster than repeatedly concatenating
    strings. This is especially apparent when reading in a genome, so
    the lines of each record are collected in a list and joined once.

    When a header occurs more than once, the sequence of its last
    occurrence is kept. An empty file gives an empty dictionary.

    Raises ValueError if sequence data appears before the first header.
    '''
    readDict = {}
    for header, chunks in _iterRecords(inFile):
        readDict[header] = ''.join(chunks).upper()
    return readDict


def main():
    '''Read the FASTA file named by the first argument and print it.'''
    print(sys.argv)
    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' <fasta file>')
    # Swap in readFasta here to keep the original letter case.
    reads = readGenome(sys.argv[1])
    print(reads)


# Only run as a script, so the readers can be imported without side effects.
if __name__ == '__main__':
    main()