I am trying to understand when it is better to execute a complex function on the CPU and when it is better to execute it on the GPU. I want a kernel that is genuinely time consuming, but I have no idea how to achieve that. I ran many iterations of a small kernel and compared the CPU and GPU execution times, but the results were not good and the speedup is always poor. I also read some links about computational intensity, but I am still confused. Could you help me build a time-consuming kernel? My code is:
#include <iostream>
#include <vector>
#include <string>
#include <chrono>
#include <random>
#include <cmath>

#include <CL/sycl.hpp>
#include <tbb/tbb.h>

using namespace tbb;

int dim = 2000;

class Clock {
private:
    typedef std::chrono::high_resolution_clock clock;
    std::chrono::time_point<clock> t;
public:
    Clock() { start(); }
    void start() { t = clock::now(); }
    double stop() const {
        return std::chrono::duration_cast<std::chrono::microseconds>(
            clock::now() - t).count();
    }
};

template<class Tin, class Tout, class Function>
class Map {
private:
    Function fun;
public:
    Map() { }
    Map(Function f) : fun(f) { }

    std::vector<Tout> operator()(bool use_tbb, std::vector<Tin> &v) {
        std::vector<Tout> r(v.size());
        if (use_tbb) {
            Clock timer; // time the CPU (TBB) path
            tbb::parallel_for(tbb::blocked_range<size_t>(0, v.size()),
                [&](const tbb::blocked_range<size_t> &t) {
                    for (size_t index = t.begin(); index < t.end(); ++index) {
                        r[index] = fun(v[index]);
                    }
                });
            std::cout << "Executing Code On CPU: " << timer.stop() << " us" << std::endl;
            return r;
        } else {
            //sycl::queue gpuQueue { sycl::gpu_selector() };
            sycl::range<1> n_item { v.size() };
            sycl::buffer<Tin, 1> in_buffer(&v[0], n_item);
            sycl::buffer<Tout, 1> out_buffer(&r[0], n_item);

            // Profiling GPU
            // Initialize property list with profiling information
            sycl::property_list propList { sycl::property::queue::enable_profiling() };
            // Build the command queue (constructed to handle event profiling)
            sycl::queue gpuQueue(sycl::gpu_selector(), propList);

            // Print out the device information used for the kernel code
            std::cout << "Device: "
                      << gpuQueue.get_device().get_info<sycl::info::device::name>()
                      << std::endl;
            std::cout << "Compute Units: "
                      << gpuQueue.get_device().get_info<sycl::info::device::max_compute_units>()
                      << std::endl;

            auto start_overall = std::chrono::system_clock::now();
            auto event = gpuQueue.submit([&](sycl::handler &h) {
                // local copy of fun
                auto f = fun;
                sycl::accessor in_accessor(in_buffer, h, sycl::read_only);
                sycl::accessor out_accessor(out_buffer, h, sycl::read_write);
                h.parallel_for(n_item, [=](sycl::id<1> index) {
                    out_accessor[index] = f(in_accessor[index]);
                });
            });
            event.wait();
            auto end_overall = std::chrono::system_clock::now();

            cl_ulong submit_time = event.template get_profiling_info<
                sycl::info::event_profiling::command_submit>();
            cl_ulong start_time = event.template get_profiling_info<
                sycl::info::event_profiling::command_start>();
            cl_ulong end_time = event.template get_profiling_info<
                sycl::info::event_profiling::command_end>();

            auto submission_time = (start_time - submit_time) / 1000000.0f;
            std::cout << "Submit Time: " << submission_time << " ms" << std::endl;
            auto execution_time = (end_time - start_time) / 1000000.0f;
            std::cout << "Execution Time: " << execution_time << " ms" << std::endl;
            auto execution_overall = std::chrono::duration_cast<std::chrono::milliseconds>(
                end_overall - start_overall);
            std::cout << "Overall Execution Time: " << execution_overall.count() << " ms" << std::endl;
        }
        return r;
    }
};

template<class Tin, class Tout, class Function>
Map<Tin, Tout, Function> make_map(Function f) {
    return Map<Tin, Tout, Function>(f);
}

int main(int argc, char *argv[]) {
    std::vector<double> r;
    int iterator = 10;
    std::cout << "The Executable File! " << argv[0] << std::endl;
    std::cout << "The Device Is! " << argv[1] << std::endl;
    std::cout << "With Vector Size! " << argv[2] << std::endl;

    // The device
    std::string device = argv[1];
    // Size of the vector
    int n = std::stoi(argv[2]);
    std::vector<double> v;

    // Type of random number distribution
    std::uniform_real_distribution<double> dist(-10, 10); // (min, max)
    // Mersenne Twister: good quality random number generator
    std::mt19937 rng;
    // Initialize with non-deterministic seed
    rng.seed(std::random_device{}());
    // Bounds
    int d = 100;

    // Generate n random numbers.
    for (int i = 0; i < n; i++) {
        v.push_back(dist(rng));
    }

    // Define a function
    // (note: log(x) returns NaN for x <= 0, and the inputs include negative values)
    auto f = [](double x) {
        return (pow(x, 2) - 10 * cos(2 * 3.142 * x) * log(x) + 10);
    };

    // Define a map
    auto m1 = make_map<double, double>(f);

    if (device == "gpu") {
        for (int j = 0; j <= iterator; j++) {
            r = m1(false, v);
        }
    }
    if (device == "cpu") {
        for (int j = 0; j <= iterator; j++) {
            r = m1(true, v);
        }
    }

    // Print the result
    // for (auto &e : r) {
    //     std::cout << "The values are: " << e << " ";
    // }

    return 0;
}
For example, the results of executing the function on the CPU and on the GPU with a vector size of 10000000, in seconds, are:
Run 1: CPU 0.04855 s, GPU 0.404 s, speedup 0.1201732
Run 2: CPU 0.022438 s, GPU 0.208 s, speedup 0.107875
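(To be explicit about how the speedup column is computed: speedup = CPU time / GPU time, e.g. 0.04855 / 0.404 ≈ 0.12 for the first run, which matches the numbers above, so a value below 1 means the GPU run is slower overall.)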
I need a high speedup.