[go: up, home]

Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EXP: try out some new manifest methods #2599

Open
wants to merge 8 commits into
base: latest
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
add --labels-to to sourmash compare
  • Loading branch information
ctb committed Apr 29, 2023
commit 6c960966c5433d9f75f1a7a86713ab59f6e9c810
4 changes: 4 additions & 0 deletions src/sourmash/cli/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ def subparser(subparsers):
help='write matrix to specified file in CSV format (with column '
'headers)'
)
subparser.add_argument(
'--labels-to', '--labels-save',
help='a CSV file containing label information',
)
subparser.add_argument(
'-p', '--processes', metavar='N', type=int, default=None,
help='Number of processes to use to calculate similarity')
Expand Down
46 changes: 32 additions & 14 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@ def compare(args):
loaded = list(loaded)
if not loaded:
notify(f'\nwarning: no signatures loaded at given ksize/molecule type/picklist from {filename}')
siglist.extend(loaded)

# track ksizes/moltypes
# add to siglist; track ksizes/moltypes
for s in loaded:
siglist.append((s, filename))
ksizes.add(s.minhash.ksize)
moltypes.add(sourmash_args.get_moltype(s))

Expand Down Expand Up @@ -96,7 +96,7 @@ def compare(args):

# check to make sure they're potentially compatible - either using
# scaled, or not.
scaled_sigs = [s.minhash.scaled for s in siglist]
scaled_sigs = [s.minhash.scaled for s, _ in siglist]
is_scaled = all(scaled_sigs)
is_scaled_2 = any(scaled_sigs)

Expand Down Expand Up @@ -130,14 +130,14 @@ def compare(args):

# notify about implicit --ignore-abundance:
if is_containment or return_ani:
track_abundances = any(( s.minhash.track_abundance for s in siglist ))
track_abundances = any(( s.minhash.track_abundance for s, _ in siglist ))
if track_abundances:
notify('NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances.')

# if using scaled sketches or --scaled, downsample to common max scaled.
printed_scaled_msg = False
if is_scaled:
max_scaled = max(s.minhash.scaled for s in siglist)
max_scaled = max(s.minhash.scaled for s, _ in siglist)
if args.scaled:
args.scaled = int(args.scaled)

Expand All @@ -147,7 +147,7 @@ def compare(args):
notify(f"WARNING: continuing with scaled value of {max_scaled}.")

new_siglist = []
for s in siglist:
for s, filename in siglist:
if not size_may_be_inaccurate and not s.minhash.size_is_accurate():
size_may_be_inaccurate = True
if s.minhash.scaled != max_scaled:
Expand All @@ -156,9 +156,9 @@ def compare(args):
printed_scaled_msg = True
with s.update() as s:
s.minhash = s.minhash.downsample(scaled=max_scaled)
new_siglist.append(s)
new_siglist.append((s, filename))
else:
new_siglist.append(s)
new_siglist.append((s, filename))
siglist = new_siglist
elif args.scaled is not None:
error("ERROR: cannot specify --scaled with non-scaled signatures.")
Expand All @@ -175,15 +175,16 @@ def compare(args):

# do all-by-all calculation

labeltext = [str(item) for item in siglist]
labeltext = [str(ss) for ss, _ in siglist]
sigsonly = [ ss for ss, _ in siglist ]
if args.containment:
similarity = compare_serial_containment(siglist, return_ani=return_ani)
similarity = compare_serial_containment(sigsonly, return_ani=return_ani)
elif args.max_containment:
similarity = compare_serial_max_containment(siglist, return_ani=return_ani)
similarity = compare_serial_max_containment(sigsonly, return_ani=return_ani)
elif args.avg_containment:
similarity = compare_serial_avg_containment(siglist, return_ani=return_ani)
similarity = compare_serial_avg_containment(sigsonly, return_ani=return_ani)
else:
similarity = compare_all_pairs(siglist, args.ignore_abundance,
similarity = compare_all_pairs(sigsonly, args.ignore_abundance,
n_jobs=args.processes, return_ani=return_ani)

# if distance matrix desired, switch to 1-similarity
Expand All @@ -193,7 +194,7 @@ def compare(args):
matrix = similarity

if len(siglist) < 30:
for i, ss in enumerate(siglist):
for i, (ss, filename) in enumerate(siglist):
# for small matrices, pretty-print some output
name_num = '{}-{}'.format(i, str(ss))
if len(name_num) > 20:
Expand All @@ -216,6 +217,23 @@ def compare(args):
with open(args.output, 'wb') as fp:
numpy.save(fp, matrix)

# output labels information via --labels-to?
if args.labels_to:
labeloutname = args.labels_to
notify(f'saving labels to: {labeloutname}')
with sourmash_args.FileOutputCSV(labeloutname) as fp:
w = csv.writer(fp)
w.writerow(['md5', 'label', 'name', 'filename', 'signature_file'])

for ss, location in siglist:
md5 = ss.md5sum()
sigfile = location
label = str(ss)
name = ss.name
filename = ss.filename

w.writerow([md5, label, name, filename, sigfile])

# output CSV?
if args.csv:
with FileOutputCSV(args.csv) as csv_fp:
Expand Down
1 change: 1 addition & 0 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ def test_compare_serial(runtmp):

testsigs = utils.get_test_data('genome-s1*.sig')
testsigs = glob.glob(testsigs)
assert len(testsigs) == 4

c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', *testsigs)

Expand Down