Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
M
MSCompress
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Yu Gao
MSCompress
Commits
b4c7bac9
Commit
b4c7bac9
authored
Oct 29, 2021
by
yugao@uic.edu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added zstd, new compression option
parent
ec70fdb5
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
47 additions
and
28 deletions
+47
-28
ms_compress_cmd.py
ms_compress_cmd.py
+47
-28
No files found.
ms_compress_cmd.py
View file @
b4c7bac9
...
...
@@ -57,9 +57,10 @@ accession_dict = {"MS:1000519": "32i",
"MS:1000514"
:
"mass"
}
def
find_string
(
mzml_read_fp
,
match_tag_start
,
match_tag_end
,
data_format
,
spec_no
):
def
find_string
(
mzml_read_fp
,
match_tag_start
,
match_tag_end
,
data_format
,
spec_no
,
compression
=
'auto'
):
start_time
=
time
.
time
()
compressor
=
zstandard
.
ZstdCompressor
(
threads
=
2
)
if
compression
==
'zstd'
:
compressor
=
zstandard
.
ZstdCompressor
(
threads
=
2
)
file_name
=
mzml_read_fp
.
name
file_size
=
os
.
path
.
getsize
(
file_name
)
encoded_start_tag
=
match_tag_start
.
encode
(
'utf-8'
)
...
...
@@ -80,7 +81,10 @@ def find_string(mzml_read_fp, match_tag_start, match_tag_end, data_format, spec_
print
(
"total time used to position all b64 data:"
,
time
.
time
()
-
start_time
)
fo
.
write
(
m
.
read
(
file_size
-
m
.
tell
()))
with
open
(
file_name
.
replace
(
'.mzML'
,
'.smzml'
),
'wb'
)
as
write_file
:
write_file
.
write
(
compressor
.
compress
(
smzml_io
.
getvalue
()))
if
compression
==
'zstd'
:
write_file
.
write
(
compressor
.
compress
(
smzml_io
.
getvalue
()))
else
:
write_file
.
write
(
smzml_io
.
getvalue
())
return
data_positions
start
=
m
.
find
(
encoded_start_tag
)
fo
.
write
(
m
.
read
(
start
+
len_start_tag
-
last_end
))
...
...
@@ -97,13 +101,15 @@ def find_string(mzml_read_fp, match_tag_start, match_tag_end, data_format, spec_
i
+=
1
def
mzml_splitter
(
mzml_file
:
str
):
def
mzml_splitter
(
mzml_file
:
str
,
compression
=
'auto'
):
start
=
time
.
time
()
if
compression
==
'zstd'
:
compressor
=
zstandard
.
ZstdCompressor
(
threads
=
2
)
# Generate file names
int_binary_file
=
mzml_file
.
replace
(
'.mzML'
,
'.bint'
)
mass_binary_file
=
mzml_file
.
replace
(
'.mzML'
,
'.bmass'
)
# mzml_out = mzml_file.replace('.mzML', '.smzml')
compressor
=
zstandard
.
ZstdCompressor
(
threads
=
2
)
# Open file pointers
mass_binary_out_fp
=
BytesIO
()
int_binary_out_fp
=
BytesIO
()
...
...
@@ -134,7 +140,7 @@ def mzml_splitter(mzml_file: str):
print
(
"Mass and intensity data format"
,
data_format
)
break
data_position
=
find_string
(
mzml_read_fp
,
'<binary>'
,
'</binary>'
,
data_format
,
spec_no
)
data_position
=
find_string
(
mzml_read_fp
,
'<binary>'
,
'</binary>'
,
data_format
,
spec_no
,
compression
=
compression
)
data_chunks
=
chunks
(
data_position
,
2
)
mass_num_data_list
=
[]
...
...
@@ -159,12 +165,14 @@ def mzml_splitter(mzml_file: str):
mzml_read_fp
.
close
()
int_binary_out_fp_final
=
open
(
int_binary_file
,
'wb'
)
mass_binary_out_fp_final
=
open
(
mass_binary_file
,
'wb'
)
int_binary_out_fp_final
.
write
(
compressor
.
compress
(
int_binary_out_fp
.
getvalue
()))
mass_binary_out_fp_final
.
write
(
compressor
.
compress
(
mass_binary_out_fp
.
getvalue
()))
int_binary_out_fp_final
.
close
()
mass_binary_out_fp_final
.
close
()
with
open
(
int_binary_file
,
'wb'
)
as
int_binary_out_fp_final
,
open
(
mass_binary_file
,
'wb'
)
as
mass_binary_out_fp_final
:
if
compression
==
'zstd'
:
int_binary_out_fp_final
.
write
(
compressor
.
compress
(
int_binary_out_fp
.
getvalue
()))
mass_binary_out_fp_final
.
write
(
compressor
.
compress
(
mass_binary_out_fp
.
getvalue
()))
else
:
int_binary_out_fp_final
.
write
(
int_binary_out_fp
.
getvalue
())
mass_binary_out_fp_final
.
write
(
mass_binary_out_fp
.
getvalue
())
def
mzml_lossy_splitter
(
mzml_file
:
str
):
...
...
@@ -232,20 +240,27 @@ def mzml_lossy_splitter(mzml_file: str):
# mass_binary_out_fp.close()
def
mzml_decoder
(
smzml_file
,
bmass_file
,
bint_file
,
mzml_file
):
decompressor
=
zstandard
.
ZstdDecompressor
()
def
mzml_decoder
(
smzml_file
,
bmass_file
,
bint_file
,
mzml_file
,
compression
=
'auto'
):
mass_io
=
BytesIO
()
int_io
=
BytesIO
()
smzml_io
=
BytesIO
()
decompressor
.
copy_stream
(
open
(
bmass_file
,
'rb'
),
mass_io
)
decompressor
.
copy_stream
(
open
(
bint_file
,
'rb'
),
int_io
)
decompressor
.
copy_stream
(
open
(
smzml_file
,
'rb'
),
smzml_io
)
if
compression
==
'zstd'
:
decompressor
=
zstandard
.
ZstdDecompressor
()
decompressor
.
copy_stream
(
open
(
bmass_file
,
'rb'
),
mass_io
)
decompressor
.
copy_stream
(
open
(
bint_file
,
'rb'
),
int_io
)
decompressor
.
copy_stream
(
open
(
smzml_file
,
'rb'
),
smzml_io
)
mass_in_fp
=
mass_io
int_in_fp
=
int_io
smzml_file
=
smzml_io
.
getvalue
()
.
decode
(
'utf8'
)
.
splitlines
()
compression_end
=
'
\n
'
else
:
mass_in_fp
=
open
(
bmass_file
,
'rb'
)
int_in_fp
=
open
(
bint_file
,
'rb'
)
smzml_file
=
open
(
smzml_file
,
'r'
,
encoding
=
'utf-8'
)
compression_end
=
''
# Create file pointers
mass_in_fp
=
mass_io
int_in_fp
=
int_io
smzml_file
=
smzml_io
.
getvalue
()
.
decode
(
'utf8'
)
.
splitlines
()
int_in_fp
.
seek
(
-
4
,
os
.
SEEK_END
)
total_spec_no
=
int
.
from_bytes
(
int_in_fp
.
read
(
4
),
byteorder
=
'little'
)
int_in_fp
.
seek
(
-
4
*
(
total_spec_no
+
1
),
os
.
SEEK_END
)
...
...
@@ -271,11 +286,13 @@ def mzml_decoder(smzml_file, bmass_file, bint_file, mzml_file):
f_out
.
write
(
'<binary>
%
s</binary>
\n
'
%
base64_encoder
(
number_array
,
data_compression
))
else
:
f_out
.
write
(
line
+
'
\n
'
)
f_out
.
write
(
line
+
compression_end
)
# Close file pointers
mass_in_fp
.
close
()
int_in_fp
.
close
()
if
compression
!=
'zstd'
:
smzml_file
.
close
()
if
__name__
==
'__main__'
:
...
...
@@ -284,7 +301,7 @@ if __name__ == '__main__':
# bmass_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.bmass'
# bint_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.bint'
# mzml_out = r'J:\mass_cloud\projects_folder\1000111\test\test2.mzML'
start
=
time
.
time
()
__status__
=
"Development"
__version__
=
"0.0.1"
...
...
@@ -331,7 +348,9 @@ if __name__ == '__main__':
optional
.
add_argument
(
"--loss_type"
,
help
=
"lossless or lossy compression"
,
default
=
"lossless"
,
choices
=
(
"lossless"
,
'lossy'
)
)
optional
.
add_argument
(
"--compression"
,
help
=
"compression method, zstandard [zstd], turbopfor [tp], auto [auto]"
,
default
=
"auto"
,
choices
=
(
"zstd"
,
'tp'
,
'auto'
)
)
# parse arguments
args
=
parser
.
parse_args
()
...
...
@@ -349,14 +368,14 @@ if __name__ == '__main__':
smzml
=
mzml_input
.
replace
(
'.mzML'
,
'.smzml'
)
if
args
.
loss_type
==
'lossless'
:
mzml_splitter
(
mzml_input
)
mzml_splitter
(
mzml_input
,
compression
=
args
.
compression
)
elif
args
.
loss_type
==
'lossy'
:
mzml_lossy_splitter
(
mzml_input
)
mzml_lossy_splitter
(
mzml_input
,
compression
=
args
.
compression
)
else
:
raise
ValueError
(
"only lossless and lossy compression could be performed"
)
mzml_decoder
(
smzml
,
mass_binary_file
,
int_binary_file
,
mzs_output
)
mzml_decoder
(
smzml
,
mass_binary_file
,
int_binary_file
,
mzs_output
,
compression
=
args
.
compression
)
print
(
time
.
time
()
-
start
)
# import pandas as pd
# compressed_time = 'D:/data/mscompress/testfile_decompresstime.xlsx'
# df = pd.read_excel(compressed_time,index_col=0)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment