Quantcast
Channel: Active questions tagged kernel - Stack Overflow
Viewing all articles
Browse latest Browse all 6334

Kernel time consuming+sycl

$
0
0

I want to understand when it is better to execute a complex function on the CPU and when on the GPU. To compare the two fairly I need a time-consuming kernel, but I have no idea how to construct one. I ran many iterations of a small kernel and compared the execution times on the CPU and the GPU, but the results were not good and the speedup was always poor. I have also read some links about computational intensity, but I am still confused. Could you help me write a time-consuming kernel? My code is:

 using namespace tbb;int dim = 2000;class Clock {private:    typedef std::chrono::high_resolution_clock clock;    std::chrono::time_point<clock> t;public:    Clock() {        start();    }    void start() {        t = clock::now();    }    double stop() const {        return std::chrono::duration_cast<std::chrono::microseconds>(                clock::now() - t).count();    }};template<class Tin, class Tout, class Function>class Map {private:    Function fun;public:    Map() {    }    Map(Function f) :            fun(f) {    }    std::vector<Tout> operator()(bool use_tbb, std::vector<Tin> &v) {        std::vector<Tout> r(v.size());        if (use_tbb) {            uTimer *timer = new uTimer("Executing Code On CPU");// start program    tbb::parallel_for(tbb::blocked_range < Tin > (0, v.size()),                    [&](tbb::blocked_range<Tin> t) {                        for (int index = t.begin(); index < t.end(); ++index) {                            r[index] = fun(v[kindex);}                    });            timer->~uTimer();            return r;        } else {            //sycl::queue gpuQueue { sycl::gpu_selector() };            sycl::range < 1 > n_item { v.size() };            sycl::buffer<Tin, 1> in_buffer(&v[0], n_item);            sycl::buffer<Tout, 1> out_buffer(&r[0], n_item);            //Profiling GPU            // Initialize property list with profiling information            sycl::property_list propList {                    sycl::property::queue::enable_profiling() };            // Build the command queue (constructed to handle event profling)            sycl::queue gpuQueue = cl::sycl::queue(sycl::gpu_selector(),                    propList);            // print out the device information used for the kernel code            std::cout << "Device: "<< gpuQueue.get_device().get_info<sycl::info::device::name>()<< std::endl;            std::cout << "Compute Units: "<< gpuQueue.get_device().get_info<                            
sycl::info::device::max_compute_units>()<< std::endl;            auto start_overall = std::chrono::system_clock::now();            auto event = gpuQueue.submit([&](sycl::handler &h) {                //local copy of fun                auto f = fun;                sycl::accessor in_accessor(in_buffer, h, sycl::read_only);                sycl::accessor out_accessor(out_buffer, h, sycl::read_write);                h.parallel_for(n_item, [=](sycl::id<1> index) {                    out_accessor[index] =  f(in_accessor[index]);                });            });            event.wait();            auto end_overall = std::chrono::system_clock::now();            cl_ulong submit_time = event.template get_profiling_info<                    cl::sycl::info::event_profiling::command_submit>();            cl_ulong start_time = event.template get_profiling_info<                    cl::sycl::info::event_profiling::command_start>();            cl_ulong end_time = event.template get_profiling_info<                    cl::sycl::info::event_profiling::command_end>();            auto submission_time = (start_time - submit_time) / 1000000.0f;            std::cout << "Submit Time: " << submission_time << " ms"<< std::endl;            auto execution_time = (end_time - start_time) / 1000000.0f;            std::cout << "Execution Time: " << execution_time << " ms"<< std::endl;            auto execution_overall = std::chrono::duration_cast< std::chrono::milliseconds > (end_overall - start_overall);            std::cout << "Overall Execution Time: " << execution_overall.count()<< " ms" << std::endl;        }        return r;    }};template<class Tin, class Tout, class Function>Map<Tin, Tout, Function> make_map(Function f) {    return Map<Tin, Tout, Function>(f);}int main(int argc, char *argv[]) {std::vector<double> r;int iterator = 10;std::cout << "The Exutable File! " << argv[0] <<std::endl;std::cout << "The Device Is! " << argv[1] << std::endl;std::cout << "with Vector Size! 
" << argv[2] << std::endl;//The Devicestd::string device = argv[1];//size of vector    int n = std::stoi(argv[2]);std::vector<double> v;//Type of random number distributionstd::uniform_real_distribution<double> dist(-10,10); //(min, max)//Mersenne Twister: Good quality random number generatorstd::mt19937 rng;//Initialize with non-deterministic seedsrng.seed(std::random_device{}());//Boundsint d = 100;// generate n random numbers.for (int i=0; i<n; i++){    v.push_back(dist(rng));     }//Define a Functionauto f = [](double x){ return (pow(x,2) - 10 * cos(2 * 3.142* x) *log(x) + 10); };//Define a Map    auto m1 = make_map<double, double>(f);if (device == "gpu"){    for(int j = 0; j<= iterator; j++){        r = m1(false, v);    }}if (device == "cpu"){    for(int j = 0; j<= iterator; j++){        r = m1(true, v);    }}}    //print the result//  for (auto &e : r){//std::cout << "Teh values are: " << e << " ";//  }    return 0;}

For example, the results of executing the function on the CPU and on the GPU with a vector size of 10000000, in seconds, are:

CPU (seconds):        GPU (seconds):
1- 0.04855            0.404
2- 0.022438           0.208

and the speedup:
1- 0.1201732 s
2- 0.107875 s

I need high speedup.


Viewing all articles
Browse latest Browse all 6334

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>