[go: nahoru, domu]

Skip to content

Commit

Permalink
bim2vcf
Browse files Browse the repository at this point in the history
  • Loading branch information
lindenb committed Jan 15, 2020
1 parent 50ca0ae commit c81f95b
Show file tree
Hide file tree
Showing 8 changed files with 277 additions and 53 deletions.
13 changes: 13 additions & 0 deletions docs/BimToVcf.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,15 @@ convert a .bim to a .vcf . For @FlorianeS44
```
Usage: bim2vcf [options] Files
Options:
--bcf-output
If this program writes a VCF to a file, The format is first guessed from
the file suffix. Otherwise, force BCF output. The current supported BCF
version is : 2.1 which is not compatible with bcftools/htslib (last
checked 2019-11-15)
Default: false
--generate-vcf-md5
Generate MD5 checksum for VCF output.
Default: false
-h, --help
print help and exit
--helpFormat
Expand Down Expand Up @@ -52,6 +61,10 @@ The java jar file will be installed in the `dist` directory.

[https://github.com/lindenb/jvarkit/tree/master/src/main/java/com/github/lindenb/jvarkit/tools/misc/BimToVcf.java](https://github.com/lindenb/jvarkit/tree/master/src/main/java/com/github/lindenb/jvarkit/tools/misc/BimToVcf.java)

### Unit Tests

[https://github.com/lindenb/jvarkit/tree/master/src/test/java/com/github/lindenb/jvarkit/tools/misc/BimToVcfTest.java](https://github.com/lindenb/jvarkit/tree/master/src/test/java/com/github/lindenb/jvarkit/tools/misc/BimToVcfTest.java)


## Contribute

Expand Down
6 changes: 3 additions & 3 deletions docs/VcfToBed.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ Usage: vcf2bed [options] Files
indexed with 'picard CreateSequenceDictionary', or any hts file
containing a dictionary (VCF, BAM, CRAM, intervals...)
-x, --slop
Extends interval by 'x' bases on both sides. A distance specified as a
positive integer.Commas are removed. The following suffixes are
interpreted : b,bp,k,kb,m,mb
Extends interval. Extending interval. The following syntaxes are
supported: 1000; 1kb; 1,000; 30%(shrink); 150% (extend); 0.5 (shrink);
1.5 (extend)
Default: 0
--version
print version and exit
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,11 @@ of this software and associated documentation files (the "Software"), to deal
* Utility to extend Intervals
*/
public interface IntervalExtender extends Function<Locatable,SimpleInterval> {
public static final String OPT_DESC="Extending interval.";
public static final String OPT_DESC="Extending interval. The following syntaxes are supported: 1000; 1kb; 1,000; 30%(shrink); 150% (extend); 0.5 (shrink); 1.5 (extend)";

public static class StringConverter implements IStringConverter<IntervalExtender> {
@Override
public IntervalExtender convert(String arg0) {
public IntervalExtender convert(final String arg0) {
return IntervalExtender.of(arg0);
}
}
Expand Down Expand Up @@ -71,10 +71,17 @@ public static IntervalExtender of(final SAMSequenceDictionary dict,final double
public static IntervalExtender of(final String decode) {
return of(null,decode);
}
/** test whether interval will be changed : fraction is !=1.0 or number of bases is !=0*/
public boolean isChanging();

/** test whether interval will be reduced : fraction is < 1.0 of number of base is <0*/

/** test whether interval will be reduced : fraction is < 1.0 of number of bases is <0*/
public boolean isShriking();

/** test whether interval will be reduced : fraction is < 1.0 of number of bases is <0*/
public boolean isExtendingByFraction();


/** using a samsequence dictionary */
public static IntervalExtender of(final SAMSequenceDictionary dict,String decode) {
decode= decode.trim();
Expand All @@ -86,37 +93,40 @@ public static IntervalExtender of(final SAMSequenceDictionary dict,String decode
return of(dict,fraction);
} else
{
int factor = 1;
// DistanceParser don't handle negative distances
if(decode.startsWith("-")) {
factor=-1;
decode=decode.substring(1);
}
final int xtend = new DistanceParser().applyAsInt(decode);
return of(dict,xtend);
return of(dict,xtend * factor);
}

}


static abstract class AbstractExtender implements IntervalExtender {
private final SAMSequenceDictionary dict;
protected AbstractExtender(final SAMSequenceDictionary dict) {
this.dict = dict;
}
protected SimpleInterval extend(final Locatable t,final int n) {
final int mid = (int)(((long)t.getStart()+(long)t.getEnd())/2L);
int beg = mid - n; /* not /2 : headeach if n==1 */
int end = mid + n;
protected SimpleInterval extend(final String contig,int beg,int end) {
// shrinking
if(beg>end) {
final int mid = (int)(((long)beg+(long)end)/2L);
beg = mid;
end = mid;
}
if(this.dict!=null) {
final SAMSequenceRecord ssr = this.dict.getSequence(t.getContig());
if(ssr==null) throw new JvarkitException.ContigNotFoundInDictionary(t.getContig(), this.dict);
if(t.getStart()> ssr.getSequenceLength()) {
throw new IllegalArgumentException("in "+t+". start is greater than contig length "+ssr.getSequenceLength());
final SAMSequenceRecord ssr = this.dict.getSequence(contig);
if(ssr==null) throw new JvarkitException.ContigNotFoundInDictionary(contig, this.dict);
if(beg> ssr.getSequenceLength()) {
throw new IllegalArgumentException("in "+contig+". start is greater than contig length "+ssr.getSequenceLength());
}
beg = Math.min(ssr.getSequenceLength(), Math.max(1, beg));
end = Math.min(ssr.getSequenceLength(), Math.max(1, end));
}
return new SimpleInterval(t.getContig(), beg, end);
return new SimpleInterval(contig, beg, end);
}
}

Expand All @@ -125,18 +135,33 @@ static class ExtendByFraction extends AbstractExtender {
final double fract;
ExtendByFraction(final SAMSequenceDictionary dict,double fract) {
super(dict);
if(fract<0.0) throw new IllegalArgumentException("negative fractions are not allowed: "+fract);
this.fract = fract;
}
@Override
public boolean isShriking() {
return fract < 1.0;
}
@Override
public boolean isExtendingByFraction() {
return true;
}
@Override
public SimpleInterval apply(final Locatable t) {
final int oldLen = t.getLengthOnReference();
final int newLen = (int)Math.ceil(oldLen* this.fract);
return extend(t,newLen - oldLen);
final int mid = (int)(((long)t.getStart()+(long)t.getEnd())/2L);
final int beg = mid - newLen/2;
final int end = beg + newLen -1;

return extend(t.getContig(),beg,end);
}

@Override
public boolean isChanging() {
return fract!=1.0;
}

@Override
public String toString() {
return getClass().getName()+" fraction : "+this.fract;
Expand All @@ -150,13 +175,23 @@ static class ExtendByDistance extends AbstractExtender {
this.xtend = xtend;
}
@Override
public boolean isChanging() {
return xtend!=0;
}

@Override
public boolean isShriking() {
return xtend < 0;
}

@Override
public boolean isExtendingByFraction() {
return false;
}

@Override
public SimpleInterval apply(final Locatable t) {
return extend(t,this.xtend);
return extend(t.getContig(),t.getStart()-this.xtend,t.getEnd()+this.xtend);
}
@Override
public String toString() {
Expand Down
54 changes: 36 additions & 18 deletions src/main/java/com/github/lindenb/jvarkit/tools/misc/BimToVcf.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,12 @@ of this software and associated documentation files (the "Software"), to deal
package com.github.lindenb.jvarkit.tools.misc;

import java.io.BufferedReader;
import java.io.File;
import java.nio.file.Path;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.function.Predicate;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
Expand All @@ -52,6 +51,9 @@ of this software and associated documentation files (the "Software"), to deal
import htsjdk.variant.vcf.VCFStandardHeaderLines;

import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParametersDelegate;
import com.github.lindenb.jvarkit.lang.CharSplitter;
import com.github.lindenb.jvarkit.util.bio.AcidNucleics;
import com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils;
import com.github.lindenb.jvarkit.util.jcommander.Launcher;
import com.github.lindenb.jvarkit.util.jcommander.Program;
Expand Down Expand Up @@ -95,23 +97,27 @@ of this software and associated documentation files (the "Software"), to deal
END_DOC
*/
import com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate;


@Program(name="bim2vcf",
description="convert a .bim to a .vcf . For @FlorianeS44",
keywords= {"bim","vcf"},
modificationDate="20190926"
modificationDate="20200115"
)
public class BimToVcf extends Launcher
{
private static final Logger LOG = Logger.build(BimToVcf.class).make();

@Parameter(names={"-o","--output"},description=OPT_OUPUT_FILE_OR_STDOUT)
private File outputFile = null;
private Path outputFile = null;

@Parameter(names={"-R","--reference"},description=INDEXED_FASTA_REFERENCE_DESCRIPTION,required=true)
private Path faidx = null;

@ParametersDelegate
private WritingVariantsDelegate writingVariants = new WritingVariantsDelegate();

public BimToVcf() {
}

Expand All @@ -127,6 +133,7 @@ public int doWork(final List<String> args) {

final SAMSequenceDictionary dict=SequenceDictionaryUtils.extractRequired(faidx);

this.writingVariants.dictionary(dict);

r = super.openBufferedReader(oneFileOrNull(args));

Expand All @@ -141,13 +148,14 @@ public int doWork(final List<String> args) {
final List<String> genotypeSampleNames = Collections.emptyList();
final VCFHeader header=new VCFHeader(headerLines, genotypeSampleNames);
header.setSequenceDictionary(dict);
w = super.openVariantContextWriter(this.outputFile);
w = this.writingVariants.open(this.outputFile);
w.writeHeader(header);
final Pattern tab=Pattern.compile("[\t]");
final CharSplitter tab=CharSplitter.TAB;
String line;
final Pattern iupacATGC = Pattern.compile("[atgcATGC]");
final Predicate<String> iupacATGC = S->AcidNucleics.isATGC(S);

while((line=r.readLine())!=null) {
String tokens[]=tab.split(line);
final String tokens[]=tab.split(line);
if(tokens.length!=6) {
LOG.error("expected 6 column in "+line);
return -1;
Expand Down Expand Up @@ -203,9 +211,9 @@ public int doWork(final List<String> args) {
vcb.chr(ssr.getSequenceName());
vcb.attribute(morgan.getID(), Float.parseFloat(tokens[2]));

if(iupacATGC.matcher(tokens[4]).matches() && iupacATGC.matcher(tokens[5]).matches())
if(iupacATGC.test(tokens[4]) && iupacATGC.test(tokens[5]))
{
String refBase=String.valueOf(genomic.charAt(pos1-1));
final String refBase=String.valueOf(genomic.charAt(pos1-1));
ref= Allele.create(refBase,true);
a1 = refBase.equalsIgnoreCase(tokens[4])?
ref:
Expand All @@ -215,15 +223,25 @@ public int doWork(final List<String> args) {
ref:
Allele.create(tokens[5],false)
;
vcb.attribute(svtype.getID(),
a1.isReference() && a2.isReference()?
"NOVARIATION":
"SNV"
);
final String type;
if(a1.isReference() && a2.isReference()) {
type = "NOVARIATION";
}
else if(tokens[4].length() < tokens[5].length()) {
type = "INS";
}
else if(tokens[4].length() > tokens[5].length()) {
type = "DEL";
}
else
{
type = "SNV";
}
vcb.attribute(svtype.getID(),type);
}
else if(
(tokens[4].equals("-") && iupacATGC.matcher(tokens[5]).matches()) ||
(tokens[5].equals("-") && iupacATGC.matcher(tokens[4]).matches())
(tokens[4].equals("-") && iupacATGC.test(tokens[5])) ||
(tokens[5].equals("-") && iupacATGC.test(tokens[4]))
) {
pos1--;//shift left
String refBase= String.valueOf(genomic.charAt(pos1-1));
Expand Down Expand Up @@ -270,7 +288,7 @@ else if(tokens[4].equals("-") && tokens[5].equals("-")) {
CloserUtil.close(r);
}
}
public static void main(String[] args) {
public static void main(final String[] args) {
new BimToVcf().instanceMainWithExit(args);
}
}
Loading

0 comments on commit c81f95b

Please sign in to comment.