Traversing a line of a text file using Python

I am trying to create genetic signatures. I have a text file full of DNA sequences. I want to read in each line from the text file, then add every 4-mer (a substring of 4 bases) to a dictionary. For example, given the sequence:

ATGATATATCTATCAT

I want to add ATGA, TGAT, GATA, etc. to a dictionary, each with an identifier that increases by 1 only when a new 4-mer is added.

So the dictionary will contain ...

Genetic signatures, ID
ATGA,1
TGAT, 2
GATA,3

Here is what I have so far ...

import sys  

def main ():
    """Read signatures.txt and try to give each 4-character slice an ID.

    The file's lines are concatenated into one string, then a loop runs once
    per character of that string, slicing four characters and storing the
    slice in a dictionary if it is not already a key.

    NOTE(review): `index` and `DnaID` are re-assigned at the top of every
    loop iteration, so `seq` is always `my_DNA[0:4]` and the dictionary can
    only ever receive one entry. The dictionary is neither returned nor
    printed by this function.
    """
    readingFile = open("signatures.txt", "r")
    my_DNA=""

    DNAseq = {} #creates dictionary 

    # Iterating a file object yields whole lines (newline included), so
    # despite its name `char` is a line here; lines are concatenated as-is.
    for char in readingFile:
        my_DNA = my_DNA+char

    # One pass per character of my_DNA; the loop variable itself is unused,
    # the slice position comes from `index` instead.
    for char in my_DNA:             
        # NOTE(review): both counters restart here on every iteration, which
        # discards the increments made at the bottom of the loop body.
        index = 0
        DnaID=1
        seq = my_DNA[index:index+4]         

        # NOTE(review): dict.has_key() exists only in Python 2; the
        # equivalent test that also works in Python 3 is `seq in DNAseq`.
        if (DNAseq.has_key(seq)): #checks if the key is in the dictionary
            index= index +1
        else :
            # New slice: store it under the current ID, then advance both
            # the position and the ID.
            DNAseq[seq] = DnaID
            index = index+1
            DnaID= DnaID+1

    readingFile.close()

# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Here is my output:

ACTC
ACTC
ACTC
ACTC
ACTC
ACTC

This output suggests that it is not iterating through each character in the string ... please help!

+5
source share
5 answers

Your index and DnaID variables are reset on every iteration of the loop. Initialise them once, before the loop:

index = 0
DnaID=1
for char in my_DNA:             
    #... rest of loop here

With that change, the output is:

ATGA 1
TGAT 2
GATA 3
ATAT 4
TATA 5
ATAT 6
TATC 6
ATCT 7
TCTA 8
CTAT 9
TATC 10
ATCA 10
TCAT 11
CAT 12
AT 13
T 14

To avoid the short sequences at the end, loop over the start positions instead:

for i in range(len(my_DNA)-3):
    #... rest of loop here

This stops 3 characters before the end, so the output is:

ATGA 1
TGAT 2
GATA 3
ATAT 4
TATA 5
ATAT 6
TATC 6
ATCT 7
TCTA 8
CTAT 9
TATC 10
ATCA 10
TCAT 11
+6

.

from collections import defaultdict

window = 4                 # length of each k-mer
DNAseq = defaultdict(int)  # 4-mer -> number of occurrences

# Count every overlapping window of `window` bases, one line at a time.
# The `with` block closes the file even if an error occurs, and handling
# each stripped line separately keeps newline characters out of the keys
# (reading the whole file at once would count windows such as 'AT\nG').
with open("signatures.txt", "r") as readingFile:
    for line in readingFile:
        sequence = line.strip()
        # Only start positions that leave room for a full window are
        # visited, so no length check is needed; lines shorter than
        # `window` give an empty range and are skipped.
        for i in range(len(sequence) - window + 1):
            DNAseq[sequence[i:i + window]] += 1

# Parenthesised form prints the same thing under Python 2 and Python 3.
print(DNAseq)
+2

index is reset to 0 on every pass through the loop, right after for char in my_DNA:.

Also, I think the loop should be something like while index < len(my_DNA)-4: instead.

+1

You reset the index every time through the for loop.

? :

DNAseq = {}   # ID -> 4-mer; only the first occurrence of a 4-mer is stored
seen = set()  # 4-mers already stored, for O(1) duplicate checks

# Join the stripped lines into one sequence; the `with` block closes the
# file as soon as reading is done.
with open("signatures.txt", "r") as readingFile:
    my_DNA = "".join(line.strip() for line in readingFile)

ID = 1
# Walk the sequence in non-overlapping 4-character chunks (use a step of 1
# instead of 4 for overlapping windows). The loop is bounded explicitly
# because slicing past the end of a string returns a shorter (or empty)
# string rather than raising IndexError, so an exception can never be used
# to stop it. The bound also drops a trailing chunk shorter than 4.
for index in range(0, len(my_DNA) - 3, 4):
    seq = my_DNA[index:index + 4]
    # Compare against what has already been stored; testing against my_DNA
    # itself is always true because seq was sliced from it.
    if seq not in seen:
        seen.add(seq)
        DNAseq[ID] = seq
    # The ID advances for every chunk, stored or not, so IDs have gaps
    # where a chunk repeated an earlier one.
    ID += 1

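What should happen if a 4-mer, say ATGC, occurs more than once? Should the dictionary then contain {...1:'ATGC', ... 200:'ATGC',...}, or something else?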

+1
source

If I understand correctly, you are counting how often each consecutive run of 4 bases occurs? Try the following:

def split_to_4mers(filename):
    """Count every overlapping 4-character window in a file's first line.

    Args:
        filename: Path of a text file whose first line holds the DNA string.

    Returns:
        A dict mapping each 4-character substring to the number of times it
        occurs. Empty if the first line has fewer than 4 characters.
    """
    dna_dict = {}
    with open(filename, 'r') as f:
        # Only the first line is read. strip() removes the trailing newline
        # (and surrounding whitespace) so it cannot leak into the last
        # window as a bogus key such as 'CAT\n'.
        dna_string = f.readline().strip()
    # Stop 3 short of the end so every slice is a full 4 characters.
    for idx in range(len(dna_string) - 3):
        seq = dna_string[idx:idx + 4]
        dna_dict[seq] = dna_dict.get(seq, 0) + 1
    return dna_dict

On a file that contains only "ATGATATATCTATCAT", this returns:

{'TGAT': 1, 'ATCT': 1, 'ATGA': 1, 'TCAT': 1, 'TATA': 1, 'TATC': 2, 'CTAT': 1, 'ATCA': 1, 'ATAT': 2, 'GATA': 1, 'TCTA': 1}
0
source

All Articles