Commit 3014751f authored by Yu Gao's avatar Yu Gao

Upload New File

parent bde39e55
## MSCompress C code
##### Description of code needed:
```python
def main(input_file, output_file){
if input_file is mzml file then
do compression(input_file, output_file)
else
do decompression(input_file, output_file)
}
def compression(input_file, output_file){
# the output file is one .msz file, which has three parts 1) the smzml, which is zstd compressed non-binary xml data 2) following the smzml, is the binary data stored in zstd compressed raw float64 data blocks, each block is approximatedly 100M. This is to ensure non spectrum is splitted in two data blocks. For this reason, we need to store the size of the data block in blk_size[]. Also, we need to store the length of each spectrum for decoding. 3) the tail, which is the information for decoding the data.
# the process is like this: 1) find all binary data positions 2) skip all binary data, read only xml and store them in blocked zstd format 3) decode each binary data and compress into zstd data blocks 4) attach the tail
data_positions = find_all_binary(input_file) # a quick pass to find positions of all binary data
# data_positions are tuples (m, n) of the start and end positions of <binary> and </binary> tag. so we can strip all the base64 encoded data inbetween to create a smzml file (file exactly the same as mzML but without binary data)
fp_in* = open(input_file)
fp_out* = open(output_file)
memory_data = malloc(100M) # buffer to minimize random disk write, may not be needed for SSD
mz_fmt, int_fmt, zlib_compression, total_spec = pattern_detect(fp_in)
# pattern_detect function read the first a few hundred lines of a mzML file to find the number encoding format for m/z value and the format for intensity value. Also, find the number of total spectra. See the above function for more details
# write all except the binary data to memory, when reach 100M, compress and dump to disk
data_temp = fp_in.read(0, data_positions[0][0]) # read from start to the first <binary>
blk_size[] = int [512] # create 512 empty int array to store the block size
spec_num[] = int [512] # create 512 empty int array to store "how many spectra in each data blk"
last_stored_spec = 0
for (i=0;i<total_spec;i++){
data_temp = fp_in.read(data_positions[i][1], data_position[i+1][0]) # read from </binary> to next <binary>
# compress with zstd when total memory_data exceed 100M
if memory_data.space_left()>len(data_temp) # make sure non spectrum is split in two
memory_data.dump(data_temp)
else
blk_size.append(len(data_temp)) # record the size of the block, write it to the end of file later, for zstd decompression
spec_num.append(i-last_stored_spec)
last_stored_spec = i
fp_out.write(zstd.compress(memory_data.existing_data()))
memory_data.clear()
memory_data.dump(data_temp)
}
fp_out.write(zstd.compress(memory_data.existing_data()))
fp_out.write(fp_in.read(fp_in.SEEK_CUR, fp_in.SEEK_END)) # write all the data from the last spectrum's </binary> to end of file
smzml_end = fp_out.SEEK_CUR # record the end of the zstd compressed smzml in fp_out
memory_data.clear()
# read binary data and decode them with zlib/base64 (use pigz for parallel zlib and https://github.com/aklomp/base64 decoding)
blk_size2[] = int [512] # create 512 empty int array to store the block size
spec_num2[] = int [512] # create 512 empty int array to store "how many spectra in each data blk"
last_stored_spec = 0
length_of_each_spec = int [total_spec] # create total_specx empty int array to store the number of data points in each spectrum, needed for decoding
if zlib_compression == 1:
for (i=0;i<total_spec;i+=num_cores){
for (j=0;j<num_cores;j++){temp_data += fp_in.read(data_positions[i*num_cores+j][0],data_positions[i*num_cores+j][1])}
# for n core system, read n binary data into temp_data, then decompress with pigz.
memory_data = pigz.parallel_decompress(temp_data)
# like above, when memory_data exceed 100M, dump with zstd to fp_out, if not, continue to append
blk_size2.append(len(data_temp))
spec_num2.append(i-last_stored_spec)
last_stored_spec = i
fp_out.write(memory_data)
length_of_each_spec.append(len(data)) # may need a loop to store the length of each spectrum for n core system
}
else # if not zlib compressed, just base64 decode:
for (i=0;i<len(data_positions);i++){
data_temp = fp_in.read(data_positions[i][0],data_positions[i][1])
memory_data = base64.decode(data_temp)
fp_out.write(memory_data)
length_of_each_spec.append(len(data))
}
binary_data_end = fp_out.SEEK_CUR
fp_out.write(length_of_each_spec) # store the length for each spectrum
fp_out.write(total_spec) # store the length of length_of_each_spec array
fp_out.write(smzml_end, binary_data_end, mz_fmt, int_fmt, zlib_compression, total_spec, blk_size, spec_num, blk_size2, spec_num2) # write a 2048+x bytes index at the end of the file
} # end of compression()
def decompression(input_file, output_file){
# decompression goes like this: 1) read from the end the tail, get info about where is the smzml, binary data and the size of each data block 3) re-assemble everything into the original mzML file
fp_in* = open(input_file)
fp_in.seek(SEEK_END-2048 bytes)
smzml_end, binary_data_end, mz_fmt, int_fmt, zlib_compression, total_spec, blk_size, spec_num, blk_size2, spec_num2 = fp_in.read(2048) # use a struct to read the tail
fp_in.seek(0)
current_smzml_blk = 0
data_positions = find_all_binary(smzml_file)
output_file.write(smzml_file[0:data_positions[0][0]]) # write the head part of the xml file before the first <binary>
current_binary_blk=0 # read first binary block into memory
binary_data_temp = zstd.decompress(fp_in.read(from smzml_end to blk_size2[current_binary_blk])) # unpack binary block
current_binary_blk+=1
spec_in_binary = spec_num2[current_blk]
decoded_binary_spec_count = 0
for (j=0;j<len(blk_size);j++){ # loop through each block
smzml_file = zstd.decompress(fp_in.read(blk_size[j])) # decompress the xml block
for (i=0;i<spec_num[j];i++){ # loop through each spectrum in each smzml block
if decoded_binary_spec_count < spec_in_binary: # if the current spectrum is still in the current binary blk
output_file.write(smzml for this spectrum)
output_file.write(binary for this spectrum)
decoded_binary_spec_count +=1
else:
binary_data_temp = zstd.decompress(fp_in.read(from smzml_end to blk_size2[current_binary_blk])) # unpack next block
current_binary_blk+=1
spec_in_binary = spec_num2[current_blk]
decoded_binary_spec_count = 0
}
output_file.write(last part of the smzml)
} # end of decompression
```
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment