Creating lambda expressions using expression templates in CUDA

I am developing a CUDA application that requires me to pass arbitrary functions to a CUDA kernel. Since declaring a function pointer for every possible case and passing them to the kernel would be too complicated (> 50 different functions), and all of them are compositions of elementary functions, such as sin(x)/y, I would like to have some minimal lambda-expression functionality for CUDA kernels. Since C++11 features are not yet supported in device code (as far as I know), and I could not find any relevant information on the Internet, I decided to teach myself expression templates and implement some simple lambda-expression rules for passing functions to the kernel.

I came up with the following code, which is a minimal implementation that compiles with NVCC and works fine. However, going down this path, I can only implement functions of one variable. Is there a way to extend my code to handle functions of several variables, for example sin(_x) + _y?

Thanks in advance!

#include<math.h>

#ifdef __CUDACC__
#define HOST_DEVICE __host__ __device__
#else
#define HOST_DEVICE
#endif

// Empty tag type. Used as the Op argument of UnaryOp to select the
// identity (placeholder) specialization UnaryOp<Id, double>.
struct Id
{
};

// Expression node for a binary operation on two sub-expressions.
//
//   Op    - policy type providing a static apply(a, b)
//   Left  - type of the left sub-expression, callable as left(x)
//   Right - type of the right sub-expression, callable as right(x)
//
// Both operands are stored by value inside the node.
template <typename Op, typename Left, typename Right>
struct BinaryOp
{
    Left left;    // left operand sub-expression
    Right right;  // right operand sub-expression

    // Stores copies of the two operand expressions.
    HOST_DEVICE BinaryOp(Left t1, Right t2)
        : left(t1)
        , right(t2)
    {
    }

    // Evaluates both operands at x and combines the results with Op::apply.
    HOST_DEVICE double operator() (double x) {
        const double lhsValue = left(x);
        const double rhsValue = right(x);
        return Op::apply(lhsValue, rhsValue);
    }
};

// Expression node for a unary operation on one sub-expression.
//
//   Op  - policy type providing a static apply(v)
//   Arg - type of the operand sub-expression, callable as arg(x)
//
// The operand is stored by value inside the node.
template <typename Op, typename Arg>
struct UnaryOp
{
    Arg arg;  // operand sub-expression

    // Stores a copy of the operand expression.
    HOST_DEVICE UnaryOp(Arg t1)
        : arg(t1)
    {
    }

    // Evaluates the operand at x and passes the result through Op::apply.
    HOST_DEVICE double operator() (double x) {
        const double operandValue = arg(x);
        return Op::apply(operandValue);
    }
};

// Leaf of the expression tree: the free variable itself.
// This full specialization holds no operand and simply returns the value
// it is evaluated at.
template <>
struct UnaryOp<Id, double>
{
    HOST_DEVICE UnaryOp() {}

    // Returns x unchanged. Declared const because the placeholder has no
    // state to modify, so it can also be evaluated through a const object
    // or a const reference.
    HOST_DEVICE double operator() (double x) const {
        return x;
    }
};

// Operation policy for UnaryOp: the sine function.
struct Sin
{
    // Returns sin(x) in double precision.
    HOST_DEVICE static double apply(double x) {
        const double result = sin(x);
        return result;
    }
};

// Operation policy for BinaryOp: addition.
struct Plus
{
    // Returns the sum of a and b.
    HOST_DEVICE static double apply(double a, double b) {
        const double sum = a + b;
        return sum;
    }
};

// Builds a BinaryOp<Plus, ...> node from two sub-expressions, so that
// `a + b` produces an expression object instead of computing a value.
//
// Marked HOST_DEVICE like the node types themselves, so expressions can be
// composed in device code as well as on the host.
//
// NOTE(review): the template is unconstrained, so it is a candidate for any
// `lhs + rhs` that considers user-defined operators; keep it out of scopes
// where unrelated class types are added together.
template <typename Left, typename Right>
HOST_DEVICE BinaryOp<Plus, Left, Right> operator+ (Left lhs, Right rhs) {
    return BinaryOp<Plus, Left, Right>(lhs, rhs);
}

// Builds a UnaryOp<Sin, ...> node wrapping the given sub-expression, so that
// `_sin(expr)` evaluates to sin(expr(x)) when the expression is called.
//
// Marked HOST_DEVICE like the node types themselves, so expressions can be
// composed in device code as well as on the host.
template <typename Arg>
HOST_DEVICE UnaryOp<Sin, Arg> _sin(Arg arg) {
    return UnaryOp<Sin, Arg>(arg);
}

// Demo kernel: evaluates the expression object `func` at `x` on the device
// and prints the result in scientific notation.
// Every launched thread executes the printf; the example launches <<<1, 1>>>.
template <class T>
__global__ void test(T func, double x) {
    const double value = func(x);
    printf("%e\n", value);
}

// Builds the expression sin(x) + x on the host, evaluates it at x = 1.0 in a
// single-thread kernel, and reports any CUDA failure.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main ()
{
    UnaryOp<Id, double> _x;
    double x = 1.0;
    test<<<1, 1>>>(_sin(_x) + _x, x);

    // A kernel launch returns no status itself; launch errors are reported
    // through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Wait for the kernel so its printf output is flushed before the
        // host exits; this call also reports errors raised during execution.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}
+3
source share
1 answer

So, I spent some time after asking the question and hacked together a solution. It is ugly, but it works for me. Here is the modified code, which supports up to 3 free variables. Additional variables could be hardcoded in the same way, but I do not need them for my project at the moment.

#include<math.h>

#ifdef __CUDACC__
#define HOST_DEVICE __host__ __device__
#else
#define HOST_DEVICE
#endif

// Empty tag type. Not referenced by the rest of this listing, which uses
// Var<N> for its placeholders instead.
struct Id
{
};

// Expression node for a binary operation on two sub-expressions of up to
// three free variables.
//
//   Op    - policy type providing a static apply(a, b)
//   Left  - type of the left sub-expression, callable as left(x1, x2, x3)
//   Right - type of the right sub-expression, callable as right(x1, x2, x3)
//
// Both operands are stored by value inside the node.
template <typename Op, typename Left, typename Right>
struct BinaryOp
{
    Left left;    // left operand sub-expression
    Right right;  // right operand sub-expression

    // Stores copies of the two operand expressions.
    HOST_DEVICE BinaryOp(Left t1, Right t2)
        : left(t1)
        , right(t2)
    {
    }

    // Evaluates both operands at (x1, x2, x3) and combines the results with
    // Op::apply. x2 and x3 default to 0.0 when omitted.
    HOST_DEVICE double operator() (double x1, double x2 = 0.0, double x3 = 0.0) {
        const double lhsValue = left(x1, x2, x3);
        const double rhsValue = right(x1, x2, x3);
        return Op::apply(lhsValue, rhsValue);
    }
};

// Expression node for a unary operation on one sub-expression of up to
// three free variables.
//
//   Op  - policy type providing a static apply(v)
//   Arg - type of the operand sub-expression, callable as arg(x1, x2, x3)
//
// The operand is stored by value inside the node.
template <typename Op, typename Arg>
struct UnaryOp
{
    Arg arg;  // operand sub-expression

    // Stores a copy of the operand expression.
    HOST_DEVICE UnaryOp(Arg t1)
        : arg(t1)
    {
    }

    // Evaluates the operand at (x1, x2, x3) and passes the result through
    // Op::apply. x2 and x3 default to 0.0 when omitted.
    HOST_DEVICE double operator() (double x1, double x2 = 0.0, double x3 = 0.0) {
        const double operandValue = arg(x1, x2, x3);
        return Op::apply(operandValue);
    }
};

// Leaf of the expression tree: a placeholder for one of the (up to three)
// free variables.
//
//   argnum - 1-based position of the argument this placeholder selects;
//            must be 1, 2 or 3.
template <int argnum>
struct Var
{
    // Compile-time range check without C++11 static_assert: the array size
    // becomes negative, and the instantiation fails to compile, unless
    // argnum is 1, 2 or 3. Without it an out-of-range argnum would silently
    // evaluate to x3.
    typedef char argnum_must_be_1_2_or_3[(argnum >= 1 && argnum <= 3) ? 1 : -1];

    HOST_DEVICE Var() {}

    // Returns the argnum-th of the supplied arguments. x2 and x3 default to
    // 0.0 when omitted. Declared const because the placeholder has no state
    // to modify, so it can also be evaluated through a const object.
    HOST_DEVICE double operator() (double x1, double x2 = 0.0, double x3 = 0.0) const {
        switch (argnum) {
            case 1:  return x1;
            case 2:  return x2;
            default: return x3;  // argnum == 3, guaranteed by the check above
        }
    }
};

// Operation policy for UnaryOp: the sine function.
struct Sin
{
    // Returns sin(x) in double precision.
    HOST_DEVICE static double apply(double x) {
        const double result = sin(x);
        return result;
    }
};

// Operation policy for BinaryOp: addition.
struct Plus
{
    // Returns the sum of a and b.
    HOST_DEVICE static double apply(double a, double b) {
        const double sum = a + b;
        return sum;
    }
};

// Builds a BinaryOp<Plus, ...> node from two sub-expressions, so that
// `a + b` produces an expression object instead of computing a value.
//
// Marked HOST_DEVICE like the node types themselves, so expressions can be
// composed in device code as well as on the host.
//
// NOTE(review): the template is unconstrained, so it is a candidate for any
// `lhs + rhs` that considers user-defined operators; keep it out of scopes
// where unrelated class types are added together.
template <typename Left, typename Right>
HOST_DEVICE BinaryOp<Plus, Left, Right> operator+ (Left lhs, Right rhs) {
    return BinaryOp<Plus, Left, Right>(lhs, rhs);
}

// Builds a UnaryOp<Sin, ...> node wrapping the given sub-expression, so that
// `_sin(expr)` evaluates to sin(expr(x1, x2, x3)) when the expression is
// called.
//
// Marked HOST_DEVICE like the node types themselves, so expressions can be
// composed in device code as well as on the host.
template <typename Arg>
HOST_DEVICE UnaryOp<Sin, Arg> _sin(Arg arg) {
    return UnaryOp<Sin, Arg>(arg);
}

// Demo kernel: evaluates the expression object `func` at (x, y, z) on the
// device and prints the result in scientific notation.
//
//   func - expression object callable as func(x1, x2, x3)
//   x, y - values for the first and second free variables
//   z    - value for the third free variable (defaults to 0.0)
//
// z is forwarded to the expression so that a placeholder for the third
// variable receives the caller's value rather than the 0.0 default of the
// expression's operator().
// Every launched thread executes the printf; the example launches <<<1, 1>>>.
template <class T>
__global__ void test(T func, double x, double y, double z = 0.0) {
    printf("%e\n", func(x, y, z));
}

// Global placeholder objects used to write expressions:
// _x selects the first call argument, _y the second.
Var<1> _x = Var<1>();
Var<2> _y = Var<2>();

// Builds the expression sin(x) + y on the host, evaluates it at
// (x, y) = (1.0, 2.0) in a single-thread kernel, and reports any CUDA failure.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main ()
{
    test<<<1, 1>>>(_sin(_x) + _y, 1.0, 2.0);

    // A kernel launch returns no status itself; launch errors are reported
    // through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Wait for the kernel so its printf output is flushed before the
        // host exits; this call also reports errors raised during execution.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

. - double ( , double). , . , NVCC ++ 11, .

, - , . !

+1

All Articles