namespace at {

/**
 * UnknownQuantizer is a placeholder quantizer for functions that implement
 * quantization in a two-step process: first a tensor is allocated with an
 * unknown quantizer, and then the quantization kernel decides what the final
 * quantizer will be.
 */
struct TORCH_API UnknownQuantizer : public Quantizer {
  explicit UnknownQuantizer(ScalarType scalar_type)
    : Quantizer(scalar_type) {}

  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;
  QScheme qscheme() const override;
  bool equalTo(QuantizerPtr other) const override;
};

/**
 * UniformQuantizer is the parent class for all uniform quantizers.
 * These quantization schemes map float values uniformly to
 * quantized values. For example, the affine quantizer is
 * the most commonly used scheme in this category.
 */
struct TORCH_API UniformQuantizer : public Quantizer {
  explicit UniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};

/**
 * NonUniformQuantizer is the parent class for all non-uniform quantizers.
 * These quantization schemes may map float values non-uniformly to quantized
 * values. K-means quantization is a representative example in this category.
 */
struct TORCH_API NonUniformQuantizer : public Quantizer {
  explicit NonUniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};

// There is also StochasticQuantizer which is uniform but not affine

/**
 * AffineQuantizer uses an affine transformation to do quantization.
 *
 * For quantize:
 * Y = clamp(round(X / scale + zero_point), min, max)
 * For dequantize:
 * X = (Y - zero_point) * scale
 */
struct TORCH_API AffineQuantizer : public UniformQuantizer {
  explicit AffineQuantizer(ScalarType scalar_type) : UniformQuantizer(scalar_type) {}
};

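// Worked example of the equations above (illustrative numbers, not from the
// source): with scale = 0.1, zero_point = 10, and the qint8 range
// [min, max] = [-128, 127]:
//   quantize:   X = 2.5  ->  Y = clamp(round(2.5 / 0.1 + 10), -128, 127) = 35
//   dequantize: Y = 35   ->  X = (35 - 10) * 0.1 = 2.5
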
// Note that we do not have a separate SymmetricQuantizer in the backend;
// this reduces complications in quantized kernel implementations.

/**
 * PerTensorAffineQuantizer stores a scale and a zero_point, which are used for
 * all the values in the Tensor.
 */
struct TORCH_API PerTensorAffineQuantizer : public AffineQuantizer {
  explicit PerTensorAffineQuantizer(ScalarType scalar_type, double scale, int64_t zero_point)
    : AffineQuantizer(scalar_type),
      scale_(scale),
      zero_point_(zero_point) {}

  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;

  QScheme qscheme() const override {
    return kPerTensorAffine;
  }

  double scale() const {
    return scale_;
  }

  int64_t zero_point() const {
    return zero_point_;
  }

  bool equalTo(QuantizerPtr other) const override {
    if (!other.get() || other->qscheme() != kPerTensorAffine) {
      return false;
    }
    auto* other_per_tensor_affine =
        static_cast<PerTensorAffineQuantizer*>(other.get());
    return scalar_type() == other_per_tensor_affine->scalar_type() &&
        scale() == other_per_tensor_affine->scale() &&
        zero_point() == other_per_tensor_affine->zero_point();
  }

 private:
  const double scale_;
  // We use int64_t for consistency with Python
  const int64_t zero_point_;
};

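// Usage sketch (a minimal illustration, assuming the standard ATen
// quantization entry points such as at::quantize_per_tensor are available):
//
//   #include <ATen/ATen.h>
//
//   at::Tensor x = at::randn({2, 3});
//   // One (scale, zero_point) pair covers every element of the tensor.
//   at::Tensor qx = at::quantize_per_tensor(
//       x, /*scale=*/0.1, /*zero_point=*/10, at::kQInt8);
//   double s = qx.q_scale();          // 0.1
//   int64_t zp = qx.q_zero_point();   // 10
//   at::Tensor x_dq = qx.dequantize();
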
/**
 * PerChannelAffineQuantizer is the same as PerTensorAffineQuantizer
 * except that we have an independent scale and zero_point parameter
 * for each channel.
 *
 * Also note that per-channel quantization is mostly applied to the output
 * channels of weights. Per-input-channel quantization of weights, or
 * per-channel quantization of activations, cannot be supported efficiently
 * on most processors, since it would require each multiplication result
 * within a single dot product to have a different scale.
 */
struct TORCH_API PerChannelAffineQuantizer : public AffineQuantizer {
  explicit PerChannelAffineQuantizer(
      ScalarType scalar_type,
      Tensor scales,
      Tensor zero_points,
      int64_t axis)
      : AffineQuantizer(scalar_type),
        scales_(std::move(scales)),
        zero_points_(std::move(zero_points)),
        axis_(axis) {}

  QScheme qscheme() const override {
    return kPerChannelAffine;
  }

  Tensor scales() const {
    return scales_;
  }

  Tensor zero_points() const {
    return zero_points_;
  }

  int64_t axis() const {
    return axis_;
  }

  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;

  bool equalTo(QuantizerPtr other) const override {
    if (!other.get() || other->qscheme() != kPerChannelAffine) {
      return false;
    }
    auto* other_per_channel_affine =
        static_cast<PerChannelAffineQuantizer*>(other.get());
    return scalar_type() == other_per_channel_affine->scalar_type() &&
        scales().equal(other_per_channel_affine->scales()) &&
        zero_points().equal(other_per_channel_affine->zero_points()) &&
        axis() == other_per_channel_affine->axis();
  }

 protected:
  Tensor scales_;
  Tensor zero_points_;
  const int64_t axis_;
};

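// Usage sketch (illustrative; assumes at::quantize_per_channel is available):
//
//   #include <ATen/ATen.h>
//
//   at::Tensor w = at::randn({4, 3});  // e.g. 4 output channels
//   // One (scale, zero_point) pair per entry along the chosen axis.
//   at::Tensor scales = at::tensor({0.1, 0.2, 0.1, 0.05}, at::kDouble);
//   at::Tensor zero_points = at::zeros({4}, at::kLong);
//   at::Tensor qw = at::quantize_per_channel(
//       w, scales, zero_points, /*axis=*/0, at::kQInt8);
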
/**
 * PerChannelAffineFloatQParamsQuantizer is the same as PerChannelAffineQuantizer
 * except that it expects both the scale and the zero point to be floating
 * point values.
 *
 * This quantizer uses the kPerChannelAffineFloatQParams qscheme, which is a
 * variant of kPerChannelAffine.
 *
 * The quantize equation in this case is:
 * Xq = (Xf - zero_point) * inv_scale, where inv_scale = 1.0/scale
 *
 * Note: A floating point zero point is useful in cases where 0 does not need
 * to be exactly represented in the quantized space. Using floating point
 * values for the zero point gives us additional precision.
 */
struct TORCH_API PerChannelAffineFloatQParamsQuantizer : public PerChannelAffineQuantizer {
  explicit PerChannelAffineFloatQParamsQuantizer(
      ScalarType scalar_type,
      Tensor scales,
      Tensor zero_points,
      int64_t axis)
      : PerChannelAffineQuantizer(scalar_type,
          std::move(scales),
          std::move(zero_points),
          axis) {}

  QScheme qscheme() const override {
    return kPerChannelAffineFloatQParams;
  }

  Tensor quantize(const Tensor& tensor) override;
  Tensor dequantize(const Tensor& qtensor) override;
  Tensor& dequantize_out(Tensor& rtensor, const Tensor& qtensor) override;

  bool equalTo(QuantizerPtr other) const override {
    if (!other.get() || other->qscheme() != kPerChannelAffineFloatQParams) {
      return false;
    }
    auto* other_per_channel_float_qparams =
        static_cast<PerChannelAffineFloatQParamsQuantizer*>(other.get());
    return scalar_type() == other_per_channel_float_qparams->scalar_type() &&
        scales().equal(other_per_channel_float_qparams->scales()) &&
        zero_points().equal(other_per_channel_float_qparams->zero_points()) &&
        axis() == other_per_channel_float_qparams->axis();
  }
};

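// Worked example of the float-qparams equations (illustrative numbers, not
// from the source): with scale = 0.25 and a floating point zero_point = 0.3:
//   quantize:   Xf = 1.3  ->  Xq = (1.3 - 0.3) * (1.0 / 0.25) = 4.0
//   dequantize: Xq = 4.0  ->  Xf = 4.0 * 0.25 + 0.3 = 1.3
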
// This is an internal utility function for getting at the QTensorImpl.
// You should only use this for writing low level
// setters/getters for QTensorImpl fields; otherwise, you should use
// the low level setters/getters that were implemented using this.
// This may be called repeatedly, so make sure it's pretty cheap.
TORCH_API QTensorImpl* get_qtensorimpl(const TensorBase& self);

// double and int64_t are used because of the native function API; these are
// the only argument types currently available in native functions.
TORCH_API QuantizerPtr
make_per_tensor_affine_quantizer(
    double scale, int64_t zero_point, ScalarType scalar_type);

TORCH_API QuantizerPtr make_per_channel_affine_quantizer(
    const Tensor& scales,
    const Tensor& zero_points,
    int64_t axis,
    ScalarType scalar_type);

TORCH_API QuantizerPtr make_unknown_quantizer(ScalarType scalar_type);

// Create a quantized Tensor given the arguments for a normal Tensor and a
// quantizer.
TORCH_API Tensor new_qtensor(
    IntArrayRef sizes,
    const TensorOptions& options,
    QuantizerPtr quantizer);

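// Usage sketch (a minimal illustration of composing the factories above):
//
//   at::QuantizerPtr q = at::make_per_tensor_affine_quantizer(
//       /*scale=*/0.1, /*zero_point=*/10, at::kQInt8);
//   at::Tensor qt = at::new_qtensor(
//       {2, 3}, at::device(at::kCPU).dtype(at::kQInt8), q);
//   // Like at::empty, qt is allocated but its values are uninitialized.
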
TORCH_API void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer);

TORCH_API Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    IntArrayRef strides,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options);

TORCH_API Tensor from_blob_quantized_per_tensor_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const float scale,
    const int64_t zeroPoint,
    const TensorOptions& options);

TORCH_API Tensor from_blob_quantized_per_channel_affine(
    void* data,
    IntArrayRef sizes,
    std::function<void(void*)> deleter,
    const Tensor& scales,
    const Tensor& zero_points,
    const int64_t axis,
    const TensorOptions& options);

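// Usage sketch (wrapping externally owned quantized data; illustrative only):
//
//   #include <vector>
//
//   std::vector<uint8_t> buf(6, 0);  // raw quint8 storage owned by the caller
//   at::Tensor qt = at::from_blob_quantized_per_tensor_affine(
//       buf.data(), /*sizes=*/{2, 3},
//       /*deleter=*/[](void*) {},    // no-op: the vector owns the memory
//       /*scale=*/0.1f, /*zeroPoint=*/10,
//       at::device(at::kCPU).dtype(at::kQUInt8));
//   // qt must not outlive buf, since no copy of the data is made.
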
} // namespace at