Commit b4c7bac9 authored by yugao@uic.edu

added zstd, new compression option

parent ec70fdb5
......@@ -57,9 +57,10 @@ accession_dict = {"MS:1000519": "32i",
"MS:1000514": "mass"}
def find_string(mzml_read_fp, match_tag_start, match_tag_end, data_format, spec_no):
def find_string(mzml_read_fp, match_tag_start, match_tag_end, data_format, spec_no, compression='auto'):
start_time = time.time()
compressor = zstandard.ZstdCompressor(threads=2)
if compression=='zstd':
compressor = zstandard.ZstdCompressor(threads=2)
file_name = mzml_read_fp.name
file_size = os.path.getsize(file_name)
encoded_start_tag = match_tag_start.encode('utf-8')
......@@ -80,7 +81,10 @@ def find_string(mzml_read_fp, match_tag_start, match_tag_end, data_format, spec_
print("total time used to position all b64 data:", time.time() - start_time)
fo.write(m.read(file_size - m.tell()))
with open(file_name.replace('.mzML', '.smzml'), 'wb') as write_file:
write_file.write(compressor.compress(smzml_io.getvalue()))
if compression=='zstd':
write_file.write(compressor.compress(smzml_io.getvalue()))
else:
write_file.write(smzml_io.getvalue())
return data_positions
start = m.find(encoded_start_tag)
fo.write(m.read(start + len_start_tag - last_end))
......@@ -97,13 +101,15 @@ def find_string(mzml_read_fp, match_tag_start, match_tag_end, data_format, spec_
i += 1
def mzml_splitter(mzml_file: str):
def mzml_splitter(mzml_file: str, compression='auto'):
start = time.time()
if compression == 'zstd':
compressor = zstandard.ZstdCompressor(threads=2)
# Generate file names
int_binary_file = mzml_file.replace('.mzML', '.bint')
mass_binary_file = mzml_file.replace('.mzML', '.bmass')
# mzml_out = mzml_file.replace('.mzML', '.smzml')
compressor = zstandard.ZstdCompressor(threads=2)
# Open file pointers
mass_binary_out_fp = BytesIO()
int_binary_out_fp = BytesIO()
......@@ -134,7 +140,7 @@ def mzml_splitter(mzml_file: str):
print("Mass and intensity data format", data_format)
break
data_position = find_string(mzml_read_fp, '<binary>', '</binary>', data_format, spec_no)
data_position = find_string(mzml_read_fp, '<binary>', '</binary>', data_format, spec_no, compression=compression)
data_chunks = chunks(data_position, 2)
mass_num_data_list = []
......@@ -159,12 +165,14 @@ def mzml_splitter(mzml_file: str):
mzml_read_fp.close()
int_binary_out_fp_final = open(int_binary_file, 'wb')
mass_binary_out_fp_final = open(mass_binary_file, 'wb')
int_binary_out_fp_final.write(compressor.compress(int_binary_out_fp.getvalue()))
mass_binary_out_fp_final.write(compressor.compress(mass_binary_out_fp.getvalue()))
int_binary_out_fp_final.close()
mass_binary_out_fp_final.close()
with open(int_binary_file, 'wb') as int_binary_out_fp_final, open(mass_binary_file, 'wb') as mass_binary_out_fp_final:
if compression == 'zstd':
int_binary_out_fp_final.write(compressor.compress(int_binary_out_fp.getvalue()))
mass_binary_out_fp_final.write(compressor.compress(mass_binary_out_fp.getvalue()))
else:
int_binary_out_fp_final.write(int_binary_out_fp.getvalue())
mass_binary_out_fp_final.write(mass_binary_out_fp.getvalue())
def mzml_lossy_splitter(mzml_file: str):
......@@ -232,20 +240,27 @@ def mzml_lossy_splitter(mzml_file: str):
# mass_binary_out_fp.close()
def mzml_decoder(smzml_file, bmass_file, bint_file, mzml_file):
decompressor = zstandard.ZstdDecompressor()
def mzml_decoder(smzml_file, bmass_file, bint_file, mzml_file, compression='auto'):
mass_io = BytesIO()
int_io = BytesIO()
smzml_io = BytesIO()
decompressor.copy_stream(open(bmass_file, 'rb'), mass_io)
decompressor.copy_stream(open(bint_file, 'rb'), int_io)
decompressor.copy_stream(open(smzml_file, 'rb'), smzml_io)
if compression=='zstd':
decompressor = zstandard.ZstdDecompressor()
decompressor.copy_stream(open(bmass_file, 'rb'), mass_io)
decompressor.copy_stream(open(bint_file, 'rb'), int_io)
decompressor.copy_stream(open(smzml_file, 'rb'), smzml_io)
mass_in_fp = mass_io
int_in_fp = int_io
smzml_file = smzml_io.getvalue().decode('utf8').splitlines()
compression_end = '\n'
else:
mass_in_fp = open(bmass_file, 'rb')
int_in_fp = open(bint_file, 'rb')
smzml_file = open(smzml_file, 'r', encoding='utf-8')
compression_end = ''
# Create file pointers
mass_in_fp = mass_io
int_in_fp = int_io
smzml_file = smzml_io.getvalue().decode('utf8').splitlines()
int_in_fp.seek(-4, os.SEEK_END)
total_spec_no = int.from_bytes(int_in_fp.read(4), byteorder='little')
int_in_fp.seek(-4 * (total_spec_no + 1), os.SEEK_END)
......@@ -271,11 +286,13 @@ def mzml_decoder(smzml_file, bmass_file, bint_file, mzml_file):
f_out.write('<binary>%s</binary>\n' % base64_encoder(number_array, data_compression))
else:
f_out.write(line+'\n')
f_out.write(line+compression_end)
# Close file pointers
mass_in_fp.close()
int_in_fp.close()
if compression != 'zstd':
smzml_file.close()
if __name__ == '__main__':
......@@ -284,7 +301,7 @@ if __name__ == '__main__':
# bmass_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.bmass'
# bint_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.bint'
# mzml_out = r'J:\mass_cloud\projects_folder\1000111\test\test2.mzML'
start = time.time()
__status__ = "Development"
__version__ = "0.0.1"
......@@ -331,7 +348,9 @@ if __name__ == '__main__':
optional.add_argument(
"--loss_type", help="lossless or lossy compression", default="lossless", choices=("lossless", 'lossy')
)
optional.add_argument(
"--compression", help="compression method, zstandard [zstd], turbopfor [tp], auto [auto]", default="auto", choices=("zstd", 'tp', 'auto')
)
# parse arguments
args = parser.parse_args()
......@@ -349,14 +368,14 @@ if __name__ == '__main__':
smzml = mzml_input.replace('.mzML', '.smzml')
if args.loss_type == 'lossless':
mzml_splitter(mzml_input)
mzml_splitter(mzml_input, compression=args.compression)
elif args.loss_type == 'lossy':
mzml_lossy_splitter(mzml_input)
mzml_lossy_splitter(mzml_input, compression=args.compression)
else:
raise ValueError("only lossless and lossy compression could be performed")
mzml_decoder(smzml, mass_binary_file, int_binary_file, mzs_output)
mzml_decoder(smzml, mass_binary_file, int_binary_file, mzs_output, compression=args.compression)
print(time.time()-start)
# import pandas as pd
# compressed_time = 'D:/data/mscompress/testfile_decompresstime.xlsx'
# df = pd.read_excel(compressed_time,index_col=0)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment