updated faster splitter

4ebff29b · yugao@uic.edu · 97e8cad5 · 4ebff29b · 4ebff29b · 4ebff29b
Commit 4ebff29b authored Sep 24, 2021 by yugao@uic.edu
Hide whitespace changes
Inline Side-by-side

Showing with 143 additions and 4 deletions

mzml_reader.py mzml_reader.py +12 -3

mzml_splitter.py mzml_splitter.py +130 -0

tester.py tester.py +1 -1

No files found.
--- a/mzml_reader.py
+++ b/mzml_reader.py
@@ -4,6 +4,7 @@ import zlib
 import base64
 import numpy as np
 import time
+import os

 np_dtype_numtype = {'i': np.int32, 'e': np.single, 'f': np.float32, 'q': np.int64, 'd': np.float64}

@@ -76,8 +77,8 @@ def mzml_splitter(mzml_file: str):
    mzml_out_fp = open(mzml_out, 'w', newline='\n', encoding='utf-8')

    # Create empty list for data temporary storage
-    data_position = []
-
+    data_format={}
+    data_position=[]
    # Iterate mzML file and get all data into data_position[]
    for event, elem in iter(ET.iterparse(mzml_file, events=('end',))):
        if elem.tag.endswith('}cvParam'):
@@ -89,10 +90,11 @@ def mzml_splitter(mzml_file: str):
                data_type_dict['data_type'] = accession_dict[elem.get('accession')]

        elif elem.tag.endswith('}binary'):
-            number_array = base64_decoder(elem.text, data_type_dict['data_encoding'], data_type_dict['data_compression'], data_type_dict['data_type'])
+            number_array = base64_lossy_decoder(elem.text, data_type_dict['data_encoding'], data_type_dict['data_compression'], data_type_dict['data_type'])
            data_type_dict['data_number']=str(len(number_array))
            data_position.append(([data_type_dict['data_number'], data_type_dict['data_encoding'], data_type_dict['data_compression'], data_type_dict['data_type']], number_array))

+
    # Split mzml into smzml, binary mass and binary int
    with open(mzml_file, 'r', encoding='utf-8') as mzml_in_fp:
        i=0
@@ -193,3 +195,10 @@ def mzml_decoder(smzml_file, bmass_file, bint_file, mzml_file):
    mass_in_fp.close()
    int_in_fp.close()
    smzml_file.close()
+
+
+# Manual timing harness, executed only when this module is run as a script.
+if __name__ == '__main__':
+    # Hard-coded path to a local test file.
+    file_name =r'K:\test\test\fusion_20200116_qjl_YLD_19.mzML'
+    start=time.time()
+    # Report the input size in decimal gigabytes (bytes / 1e9).
+    print(os.path.getsize(file_name)/1e9,"GB")
+    # Elapsed seconds since `start`; no processing call sits in between.
+    print(time.time()-start)
\ No newline at end of file
--- a/mzml_splitter.py
+++ b/mzml_splitter.py
+# mscompress-pub.mzml_splitter created by bathy at 9/24/2021
+import mmap
+import os
+import numpy as np
+import time
+import base64
+import zlib
+from xml.etree import cElementTree as ET
+
+# Maps the trailing type character of an encoding label (the 'f' in '32f')
+# to the NumPy scalar type used to interpret the decoded bytes.
+# 'e' has to be the 2-byte half float: np.single is an alias of np.float32
+# and would misread the 16-bit arrays labelled '16e'.
+# NOTE(review): the label '64i' ends in 'i' and therefore resolves to
+# np.int32, so the 'q' entry is never reached through accession_dict.
+# Left unchanged because the label is written into the .smzml placeholder.
+np_dtype_numtype = {'i': np.int32, 'e': np.half, 'f': np.float32, 'q': np.int64, 'd': np.float64}
+
+# cvParam accession -> short label used by the splitter:
+# five number encodings, two compression modes, two array kinds.
+accession_dict = {"MS:1000519": "32i",
+                  "MS:1000520": "16e",
+                  "MS:1000521": "32f",
+                  "MS:1000522": "64i",
+                  "MS:1000523": "64d",
+                  "MS:1000574": "zlib",
+                  "MS:1000576": "no compression",
+                  "MS:1000515": "intensity",
+                  "MS:1000514": "mass"}
+
+
+
+
+def find_string(file_name, match_tag_start, match_tag_end):
+    """Locate the content of every start-tag/end-tag pair in a file.
+
+    The file is memory-mapped and scanned as raw bytes, so the returned
+    offsets are byte offsets, not character offsets.
+
+    Args:
+        file_name: Path of the file to scan.
+        match_tag_start: Opening marker, e.g. ``'<binary>'``.
+        match_tag_end: Closing marker, e.g. ``'</binary>'``.
+
+    Returns:
+        A list of ``(content_start, content_end)`` tuples in file order;
+        ``content_start`` is the first byte after the opening marker and
+        ``content_end`` is the first byte of the closing marker.
+
+    Raises:
+        ValueError: If an opening marker has no closing marker after it.
+    """
+    start_time = time.time()
+    # Encode once; the scan compares bytes.
+    start_tag = match_tag_start.encode('utf-8')
+    end_tag = match_tag_end.encode('utf-8')
+    data_positions = []
+    with open(file_name, 'rb') as f:
+        # An empty file cannot be memory-mapped and cannot contain a marker.
+        if os.fstat(f.fileno()).st_size > 0:
+            # Length 0 maps the whole file; the context manager unmaps it.
+            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
+                pos = 0
+                while True:
+                    start = m.find(start_tag, pos)
+                    if start == -1:
+                        break
+                    content_start = start + len(start_tag)
+                    # Search only after the opening marker so the pair
+                    # can never be matched out of order.
+                    end = m.find(end_tag, content_start)
+                    if end == -1:
+                        # Continuing with -1 would rewind the scan and
+                        # could loop forever on earlier markers.
+                        raise ValueError(
+                            "no %r found after %r at byte offset %d in %s"
+                            % (match_tag_end, match_tag_start, start, file_name))
+                    data_positions.append((content_start, end))
+                    pos = end + len(end_tag)
+    print("total time used to position all b64 data:",time.time()-start_time)
+    return data_positions
+
+
+def base64_decoder(base64_data:bytes, number_fmt, compress_method, array_type):
+    """Turn one base64 payload into a NumPy array.
+
+    Args:
+        base64_data: Base64-encoded bytes, or None when there is no payload.
+        number_fmt: Encoding label; its last character selects the dtype
+            through ``np_dtype_numtype``.
+        compress_method: ``'zlib'`` to inflate the decoded bytes; any other
+            value leaves them untouched.
+        array_type: Accepted for signature compatibility; not used here.
+
+    Returns:
+        The decoded array, or an empty default-dtype array when
+        ``base64_data`` is None.
+    """
+    # Nothing to decode: hand back an empty array.
+    if base64_data is None:
+        return np.array([])
+    element_type = np_dtype_numtype[number_fmt[-1]]
+    raw_bytes = base64.decodebytes(base64_data)
+    if compress_method == 'zlib':
+        raw_bytes = zlib.decompress(raw_bytes)
+    return np.frombuffer(raw_bytes, dtype=element_type)
+
+
+def _detect_array_formats(mzml_file):
+    """Read the encoding, compression and kind of the binary arrays.
+
+    Streams start-tag events and stops once both array kinds ('mass' and
+    'intensity') have been described.
+
+    Returns:
+        A dict keyed by array kind; each value holds 'data_encoding',
+        'data_compression' and 'data_type'.
+    """
+    encodings = ('MS:1000521', 'MS:1000522', 'MS:1000523', 'MS:1000519', 'MS:1000520')
+    compressions = ('MS:1000576', 'MS:1000574')
+    array_kinds = ('MS:1000515', 'MS:1000514')
+    data_format = {}
+    # Values assumed until the file states otherwise.
+    current_encoding = '32f'
+    current_compress = 'no compression'
+    # Open the file ourselves so it is closed when the scan stops early.
+    with open(mzml_file, 'rb') as xml_fp:
+        for event, elem in ET.iterparse(xml_fp, events=('start',)):
+            if elem.tag.endswith('}cvParam'):
+                accession = elem.get('accession') or ''
+                if accession in encodings:  # number encoding
+                    current_encoding = accession_dict[accession]
+                elif accession in compressions:  # compression mode
+                    current_compress = accession_dict[accession]
+                elif accession in array_kinds:  # retrieves array_type
+                    current_type = accession_dict[accession]
+                    data_format[current_type] = {'data_encoding': current_encoding,
+                                                 'data_compression': current_compress,
+                                                 'data_type': current_type}
+            elif elem.tag.endswith('}spectrumList'):
+                spec_no = elem.get('count')
+                # The attribute is text; convert before doubling so the
+                # printed figure is a number rather than a repeated string.
+                if spec_no is not None and spec_no.isdigit():
+                    print(int(spec_no) * 2)
+            elif len(data_format) == 2:
+                print(data_format)
+                break
+    return data_format
+
+
+def mzml_splitter(mzml_file: str):
+    """Split an mzML file into a slim XML file and two raw binary files.
+
+    Three files are written next to the input, sharing its base name:
+    ``.smzml`` (the XML with each ``<binary>`` payload replaced by a
+    ``$count$encoding$compression$type$`` placeholder), ``.bmass`` and
+    ``.bint`` (the decoded mass and intensity arrays, concatenated).
+
+    The code assumes every ``<binary>`` element starts its own line and
+    that arrays alternate mass, intensity, mass, ... starting with mass.
+
+    Args:
+        mzml_file: Path of the mzML file to split.
+
+    Raises:
+        KeyError: If a binary array is met whose kind was not described
+            by the cvParams seen during format detection.
+    """
+    # Derive outputs from the base name. str.replace('.mzML', ...) returns
+    # the input path unchanged when the suffix differs (e.g. '.mzml'), and
+    # opening that path for writing would truncate the input.
+    base_name = os.path.splitext(mzml_file)[0]
+    int_binary_file = base_name + '.bint'
+    mass_binary_file = base_name + '.bmass'
+    mzml_out = base_name + '.smzml'
+
+    # Gather everything that can fail on parsing before creating outputs.
+    data_format = _detect_array_formats(mzml_file)
+    # Scan the file passed by the caller, not a module-level global.
+    data_position = find_string(mzml_file, '<binary>', '</binary>')
+
+    with open(mass_binary_file, 'wb') as mass_binary_out_fp, \
+            open(int_binary_file, 'wb') as int_binary_out_fp, \
+            open(mzml_out, 'w', newline='\n', encoding='utf-8') as mzml_out_fp, \
+            open(mzml_file, 'r', encoding='utf-8') as mzml_in_fp, \
+            mmap.mmap(mzml_in_fp.fileno(), 0, access=mmap.ACCESS_READ) as m:
+        i = 0
+        for line in mzml_in_fp:
+            if not line.lstrip().startswith('<binary>'):
+                mzml_out_fp.write(line)
+                continue
+            # Keep the indentation that precedes the tag.
+            mzml_out_fp.write(line.split('<binary>')[0])
+            # Even positions are mass arrays, odd positions intensity.
+            fmt = data_format['mass' if i % 2 == 0 else 'intensity']
+            content_start, content_end = data_position[i]
+            b64_content_byte = m[content_start:content_end]
+            number_array = base64_decoder(b64_content_byte, fmt['data_encoding'], fmt['data_compression'], fmt['data_type'])
+            mzml_out_fp.write("<binary>$%s$</binary>\n" % '$'.join([str(len(number_array)), fmt['data_encoding'], fmt['data_compression'], fmt['data_type']]))
+            number_fmt = np_dtype_numtype[fmt['data_encoding'][-1]]
+            if fmt['data_type'] == 'mass':
+                target_fp = mass_binary_out_fp
+            else:
+                target_fp = int_binary_out_fp
+            target_fp.write(number_array.astype(number_fmt))
+            i += 1
+
+
+# Script entry point: split one mzML file and report its size and run time.
+if __name__ == '__main__':
+    import sys
+    # Take the path from the command line when given; otherwise fall back
+    # to the original hard-coded test file.
+    file_name = sys.argv[1] if len(sys.argv) > 1 else r'K:\test\test\fusion_20200116_qjl_YLD_19.mzML'
+    start=time.time()
+    # Input size in decimal gigabytes (bytes / 1e9).
+    print(os.path.getsize(file_name)/1e9,"GB")
+    # mzml_splitter returns nothing, so its result is not printed.
+    mzml_splitter(file_name)
+    print(time.time()-start)
+
--- a/tester.py
+++ b/tester.py
 # mscompress-pub.tester created by bathy at 9/23/2021
 import mzml_reader as mzr

-mzml_file = r'K:\test\32bit_uncompressed_centroid\fusion_20200116_qjl_YLD_19.mzML'
+mzml_file = r'K:\test\64bit_zlib_profile\fusion_20200116_qjl_YLD_19.mzML'
 smzml_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.smzml'
 bmass_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.bmass'
 bint_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.bint'