# the output file is one .msz file, which has three parts 1) the smzml, which is zstd compressed non-binary xml data 2) following the smzml, is the binary data stored in zstd compressed raw float64 data blocks, each block is approximatedly 100M. This is to ensure non spectrum is splitted in two data blocks. For this reason, we need to store the size of the data block in blk_size[]. Also, we need to store the length of each spectrum for decoding. 3) the tail, which is the information for decoding the data.
# the process is like this: 1) find all binary data positions 2) skip all binary data, read only xml and store them in blocked zstd format 3) decode each binary data and compress into zstd data blocks 4) attach the tail
data_positions=find_all_binary(input_file)# a quick pass to find positions of all binary data
# data_positions are tuples (m, n) of the start and end positions of <binary> and </binary> tag. so we can strip all the base64 encoded data inbetween to create a smzml file (file exactly the same as mzML but without binary data)
fp_in*=open(input_file)
fp_out*=open(output_file)
memory_data=malloc(100M)# buffer to minimize random disk write, may not be needed for SSD
# pattern_detect function read the first a few hundred lines of a mzML file to find the number encoding format for m/z value and the format for intensity value. Also, find the number of total spectra. See the above function for more details
# write all except the binary data to memory, when reach 100M, compress and dump to disk
data_temp=fp_in.read(0,data_positions[0][0])# read from start to the first <binary>
blk_size[]=int[512]# create 512 empty int array to store the block size
spec_num[]=int[512]# create 512 empty int array to store "how many spectra in each data blk"
last_stored_spec=0
for(i=0;i<total_spec;i++){
data_temp=fp_in.read(data_positions[i][1],data_position[i+1][0])# read from </binary> to next <binary>
# compress with zstd when total memory_data exceed 100M
ifmemory_data.space_left()>len(data_temp)# make sure non spectrum is split in two
memory_data.dump(data_temp)
else
blk_size.append(len(data_temp))# record the size of the block, write it to the end of file later, for zstd decompression
fp_out.write(length_of_each_spec)# store the length for each spectrum
fp_out.write(total_spec)# store the length of length_of_each_spec array
fp_out.write(smzml_end,binary_data_end,mz_fmt,int_fmt,zlib_compression,total_spec,blk_size,spec_num,blk_size2,spec_num2)# write a 2048+x bytes index at the end of the file
}# end of compression()
defdecompression(input_file,output_file){
# decompression goes like this: 1) read from the end the tail, get info about where is the smzml, binary data and the size of each data block 3) re-assemble everything into the original mzML file
fp_in*=open(input_file)
fp_in.seek(SEEK_END-2048bytes)
smzml_end,binary_data_end,mz_fmt,int_fmt,zlib_compression,total_spec,blk_size,spec_num,blk_size2,spec_num2=fp_in.read(2048)# use a struct to read the tail
fp_in.seek(0)
current_smzml_blk=0
data_positions=find_all_binary(smzml_file)
output_file.write(smzml_file[0:data_positions[0][0]])# write the head part of the xml file before the first <binary>
current_binary_blk=0# read first binary block into memory