[go: nahoru, domu]

Skip to content

Commit

Permalink
Implement fill_missing (enso-org#1372)
Browse files Browse the repository at this point in the history
  • Loading branch information
radeusgd committed Dec 22, 2020
1 parent 6075c1e commit ab51bff
Show file tree
Hide file tree
Showing 11 changed files with 221 additions and 40 deletions.
23 changes: 18 additions & 5 deletions distribution/std-lib/Table/src/Data/Column.enso
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from Base import all
import Table.Data.Table
import Table.Data.Storage

polyglot java import org.enso.table.data.table.Column as Java_Column

Expand Down Expand Up @@ -136,6 +137,17 @@ type Column
is_missing : Column
is_missing = here.run_vectorized_unary_op this "is_missing" (== Nothing)

## Returns a new column where missing values have been replaced with the
   provided default.

   The name and index of this column are passed on unchanged to the new
   column; only the underlying storage is replaced, by the result of the
   storage's `fillMissing` operation.

   Arguments:
   - default: the value to use in place of every missing entry.
fill_missing : Any -> Column
fill_missing default =
    storage = this.java_column.getStorage []
    index = this.java_column.getIndex []
    name = this.java_column.getName []
    new_st = storage.fillMissing [default]
    col = Java_Column.new [name, index, new_st].to_array
    Column col

## Applies `function` to each item in this column and returns the column
of results.
map function =
Expand Down Expand Up @@ -175,13 +187,14 @@ type Column
to_vector = Vector.new this.length this.at

## Returns the underlying storage type of this column.
storage_type : Storage.Type
storage_type =
tp = this.java_column.getStorage [] . getType []
if tp == Storage_Type_String then Text else
if tp == Storage_Type_Long then Integer else
if tp == Storage_Type_Double then Decimal else
if tp == Storage_Type_Bool then Boolean else
Any
if tp == Storage_Type_String then Storage.Text else
if tp == Storage_Type_Long then Storage.Integer else
if tp == Storage_Type_Double then Storage.Decimal else
if tp == Storage_Type_Bool then Storage.Boolean else
Storage.Any

## Converts this column to JSON.
to_json =
Expand Down
7 changes: 7 additions & 0 deletions distribution/std-lib/Table/src/Data/Storage.enso
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
## Represents different types of underlying storage for Columns.
type Type
    ## Reported by `Column.storage_type` for string-backed storage.
    type Text
    ## Reported by `Column.storage_type` for long-backed storage.
    type Integer
    ## Reported by `Column.storage_type` for double-backed storage.
    type Decimal
    ## Reported by `Column.storage_type` for boolean-backed storage.
    type Boolean
    ## Reported by `Column.storage_type` for any other kind of storage.
    type Any
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
package org.enso.table.data.column.builder.object;

import java.util.BitSet;
import org.enso.table.data.column.storage.BoolStorage;
import org.enso.table.data.column.storage.Storage;

import java.util.BitSet;

/**
* A builder for boolean columns.
*/
/** A builder for boolean columns. */
public class BoolBuilder extends TypedBuilder {
private final BitSet vals = new BitSet();
private final BitSet isNa = new BitSet();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@
package org.enso.table.data.column.builder.object;

import java.util.BitSet;
import org.enso.table.data.column.storage.DoubleStorage;
import org.enso.table.data.column.storage.LongStorage;
import org.enso.table.data.column.storage.Storage;

import java.util.BitSet;

/**
* A builder for numeric columns.
*/
/** A builder for numeric columns. */
public class NumericBuilder extends TypedBuilder {
private boolean isDouble;
private int currentSize;
private final int size;
private final BitSet isMissing = new BitSet();
private final long[] data;
private boolean isDouble;
private int currentSize;

private NumericBuilder(boolean isDouble, int size) {
this.size = size;
Expand Down Expand Up @@ -79,6 +76,16 @@ public void append(Object o) {
}
}

/**
 * Stores an already-encoded item in the next free slot of this builder.
 *
 * <p>Only the data slot is written and the size counter advanced; the value is stored exactly as
 * given.
 *
 * @param rawData the raw encoding of the item: the number itself for long builders, and the long
 *     bit pattern of the value for double builders
 */
public void appendRaw(long rawData) {
  final int slot = currentSize++;
  data[slot] = rawData;
}

@Override
public int getCurrentSize() {
return currentSize;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
package org.enso.table.data.column.storage;

import java.util.BitSet;
import org.enso.table.data.column.operation.map.MapOpStorage;
import org.enso.table.data.column.operation.map.MapOperation;
import org.enso.table.data.column.operation.map.UnaryMapOperation;
import org.enso.table.data.index.Index;
import org.enso.table.error.UnexpectedColumnTypeException;
import org.enso.table.error.UnexpectedTypeException;

import java.util.BitSet;

/** A boolean column storage. */
public class BoolStorage extends Storage {
private static final MapOpStorage<BoolStorage> ops = buildOps();
Expand Down Expand Up @@ -36,7 +35,7 @@ public long getType() {

@Override
public Object getItemBoxed(int idx) {
return isMissing.get(idx) ? null : values.get(idx);
return isMissing.get(idx) ? null : getItem(idx);
}

public boolean getItem(long idx) {
Expand Down Expand Up @@ -71,6 +70,32 @@ public BitSet getIsMissing() {
return isMissing;
}

/**
 * Creates a new BoolStorage in which all missing values have been replaced by {@code arg}.
 *
 * <p>The result uses an empty isMissing bitset and keeps this storage's {@code negated} flag.
 * Because of that, the bit written at each missing position has to be the raw (pre-negation)
 * encoding of {@code arg}: the bit is set when {@code arg != negated} and cleared otherwise.
 * Bits at non-missing positions are copied unchanged.
 *
 * @param arg the logical value to use for missing elements
 * @return a new storage of the same size with no missing elements
 */
private BoolStorage fillMissingBoolean(boolean arg) {
  final var newValues = (BitSet) values.clone();
  // NOTE(review): assumes the logical value of a cell is `values.get(i) != negated` (what
  // getItem is expected to compute) — confirm against getItem. Without accounting for
  // `negated` here, a negated storage would be filled with the opposite of `arg`.
  if (arg != negated) {
    newValues.or(isMissing);
  } else {
    newValues.andNot(isMissing);
  }
  return new BoolStorage(newValues, new BitSet(), size, negated);
}

/**
 * Replaces missing elements with {@code arg}.
 *
 * <p>A {@link Boolean} argument is handled by the boolean-specific implementation; any other
 * argument is delegated to the superclass implementation.
 *
 * @param arg the value to use for missing elements
 * @return a new storage with missing elements replaced by {@code arg}
 */
@Override
public Storage fillMissing(Object arg) {
  if (!(arg instanceof Boolean)) {
    return super.fillMissing(arg);
  }
  final boolean replacement = (Boolean) arg;
  return fillMissingBoolean(replacement);
}

@Override
public Storage mask(BitSet mask, int cardinality) {
BitSet newMissing = new BitSet();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
package org.enso.table.data.column.storage;

import java.util.BitSet;
import org.enso.table.data.column.builder.object.NumericBuilder;
import org.enso.table.data.column.operation.map.MapOpStorage;
import org.enso.table.data.column.operation.map.MapOperation;
import org.enso.table.data.column.operation.map.UnaryMapOperation;
import org.enso.table.data.column.operation.map.numeric.DoubleBooleanOp;
import org.enso.table.data.column.operation.map.numeric.DoubleNumericOp;
import org.enso.table.data.index.Index;

import java.util.BitSet;

/** A column containing floating point numbers. */
public class DoubleStorage extends Storage {
private final long[] data;
Expand Down Expand Up @@ -74,6 +73,30 @@ protected Storage runVectorizedZip(String name, Storage argument) {
return ops.runMap(name, this, argument);
}

/**
 * Builds a double storage in which every missing element is replaced by {@code arg}.
 *
 * <p>Present elements are copied in their raw form; missing ones receive the raw long bits of
 * {@code arg}.
 *
 * @param arg the value to use for missing elements
 * @return the sealed result of a double builder filled with one item per element
 */
private Storage fillMissingDouble(double arg) {
  final long fillBits = Double.doubleToRawLongBits(arg);
  final var filled = NumericBuilder.createDoubleBuilder(size());
  for (int row = 0; row < size(); row++) {
    filled.appendRaw(isMissing.get(row) ? fillBits : data[row]);
  }
  return filled.seal();
}

/**
 * Replaces missing elements with {@code arg}.
 *
 * <p>{@link Double} and {@link Long} arguments are both filled in as doubles (a Long is widened
 * to double first); any other argument is delegated to the superclass implementation.
 *
 * @param arg the value to use for missing elements
 * @return a new storage with missing elements replaced by {@code arg}
 */
@Override
public Storage fillMissing(Object arg) {
  if (arg instanceof Double) {
    final double replacement = (Double) arg;
    return fillMissingDouble(replacement);
  }
  if (arg instanceof Long) {
    final long whole = (Long) arg;
    return fillMissingDouble((double) whole);
  }
  return super.fillMissing(arg);
}

@Override
public DoubleStorage mask(BitSet mask, int cardinality) {
BitSet newMissing = new BitSet();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
package org.enso.table.data.column.storage;

import java.util.BitSet;
import org.enso.table.data.column.builder.object.NumericBuilder;
import org.enso.table.data.column.operation.map.MapOpStorage;
import org.enso.table.data.column.operation.map.MapOperation;
import org.enso.table.data.column.operation.map.UnaryMapOperation;
import org.enso.table.data.column.operation.map.numeric.LongBooleanOp;
import org.enso.table.data.column.operation.map.numeric.LongNumericOp;
import org.enso.table.data.index.Index;

import java.util.BitSet;

/** A column storing 64-bit integers. */
public class LongStorage extends Storage {
private final long[] data;
Expand Down Expand Up @@ -74,6 +73,43 @@ protected Storage runVectorizedZip(String name, Storage argument) {
return ops.runZip(name, this, argument);
}

/**
 * Builds a double storage from this long storage, replacing every missing element by {@code arg}.
 *
 * <p>Present long values are converted to double and stored as their raw long bits; missing
 * ones receive the raw long bits of {@code arg}.
 *
 * @param arg the value to use for missing elements
 * @return the sealed result of a double builder filled with one item per element
 */
private Storage fillMissingDouble(double arg) {
  final long fillBits = Double.doubleToRawLongBits(arg);
  final var filled = NumericBuilder.createDoubleBuilder(size());
  for (int row = 0; row < size(); row++) {
    final long bits =
        isMissing.get(row) ? fillBits : Double.doubleToRawLongBits((double) data[row]);
    filled.appendRaw(bits);
  }
  return filled.seal();
}

/**
 * Builds a long storage in which every missing element is replaced by {@code arg}.
 *
 * @param arg the value to use for missing elements
 * @return the sealed result of a long builder filled with one item per element
 */
private Storage fillMissingLong(long arg) {
  final var filled = NumericBuilder.createLongBuilder(size());
  for (int row = 0; row < size(); row++) {
    filled.appendRaw(isMissing.get(row) ? arg : data[row]);
  }
  return filled.seal();
}

/**
 * Replaces missing elements with {@code arg}.
 *
 * <p>A {@link Double} argument goes through the double-producing fill, a {@link Long} argument
 * through the long-producing fill, and anything else is delegated to the superclass
 * implementation.
 *
 * @param arg the value to use for missing elements
 * @return a new storage with missing elements replaced by {@code arg}
 */
@Override
public Storage fillMissing(Object arg) {
  if (arg instanceof Double) {
    final double replacement = (Double) arg;
    return fillMissingDouble(replacement);
  }
  if (arg instanceof Long) {
    final long replacement = (Long) arg;
    return fillMissingLong(replacement);
  }
  return super.fillMissing(arg);
}

@Override
public LongStorage mask(BitSet mask, int cardinality) {
BitSet newMissing = new BitSet();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,10 @@
package org.enso.table.data.column.storage;

import org.enso.table.data.column.builder.object.BoolBuilder;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.builder.object.InferredBuilder;
import java.util.BitSet;
import org.enso.table.data.column.operation.map.MapOpStorage;
import org.enso.table.data.column.operation.map.MapOperation;
import org.enso.table.data.column.operation.map.UnaryMapOperation;
import org.enso.table.data.index.Index;

import java.util.BitSet;
import java.util.function.Function;

/** A column storing arbitrary objects. */
public class ObjectStorage extends Storage {
private final Object[] data;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
package org.enso.table.data.column.storage;

import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.builder.object.InferredBuilder;
import org.enso.table.data.column.operation.map.MapOpStorage;

import java.util.BitSet;
import java.util.function.BiFunction;
import java.util.function.Function;
import org.enso.table.data.column.builder.object.Builder;
import org.enso.table.data.column.builder.object.InferredBuilder;
import org.enso.table.data.column.builder.object.ObjectBuilder;

/** An abstract representation of a data column. */
public abstract class Storage {
Expand Down Expand Up @@ -146,6 +145,28 @@ public final Storage zip(String name, BiFunction<Object, Object, Object> functio
return builder.seal();
}

/**
 * Produces a new storage in which each missing element is replaced by the given value.
 *
 * <p>This default implementation collects the result into an {@link ObjectBuilder} sized to
 * this storage.
 *
 * @param arg the value to use for missing elements
 * @return a new storage, with all missing elements replaced by arg
 */
public Storage fillMissing(Object arg) {
  final Builder target = new ObjectBuilder(size());
  return fillMissingHelper(arg, target);
}

/**
 * Copies every element of this storage into {@code builder}, substituting {@code arg} wherever
 * the boxed element is {@code null}, and seals the builder.
 *
 * @param arg the value to append in place of missing elements
 * @param builder the builder that receives the items
 * @return the storage produced by sealing {@code builder}
 */
protected final Storage fillMissingHelper(Object arg, Builder builder) {
  for (int row = 0; row < size(); row++) {
    final Object current = getItemBoxed(row);
    builder.append(current == null ? arg : current);
  }
  return builder.seal();
}

/**
* Return a new storage, containing only the items marked true in the mask.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
package org.enso.table.data.column.storage;

import java.util.BitSet;
import org.enso.table.data.column.builder.object.StringBuilder;
import org.enso.table.data.column.operation.map.MapOpStorage;
import org.enso.table.data.column.operation.map.MapOperation;
import org.enso.table.data.index.Index;

import java.util.BitSet;

/** A column storing strings. */
public class StringStorage extends ObjectStorage {
Expand Down Expand Up @@ -48,6 +47,15 @@ protected Storage runVectorizedZip(String name, Storage argument) {
return ops.runZip(name, this, argument);
}

/**
 * Replaces missing elements with {@code arg}.
 *
 * <p>A {@link String} argument is collected through a string builder sized to this storage;
 * any other argument is delegated to the superclass implementation.
 *
 * @param arg the value to use for missing elements
 * @return a new storage with missing elements replaced by {@code arg}
 */
@Override
public Storage fillMissing(Object arg) {
  if (!(arg instanceof String)) {
    return super.fillMissing(arg);
  }
  return fillMissingHelper(arg, new StringBuilder(size()));
}

@Override
public StringStorage mask(BitSet mask, int cardinality) {
ObjectStorage storage = super.mask(mask, cardinality);
Expand Down
Loading

0 comments on commit ab51bff

Please sign in to comment.