diff --git a/bin/pareval b/bin/pareval
new file mode 100755
index 0000000..3f2ea12
--- /dev/null
+++ b/bin/pareval
@@ -0,0 +1,89 @@
+#!/bin/bash
+# Wrapper script to run the components of ParEval.
+# It can be run as:
+#   pareval <command> [options]
+#
+# Commands:
+#   generate - Generate LLM outputs for ParEval. See generate/generate.py for full argument list.
+#   evaluate - Evaluate LLM outputs for ParEval. See drivers/run-all.py for full argument list.
+#   help | -h | --help - Show a help message.
+#   version | -v | --version - Show the version of ParEval.
+
+VERSION="v1.1"
+
+if [[ "$#" -eq 0 ]]; then
+    echo "No command provided. Use 'pareval help' for usage information." >&2
+    exit 1
+fi
+
+# If PAREVAL_ROOT is not set, default it to the parent directory of the directory holding this script.
+if [[ -z "$PAREVAL_ROOT" ]]; then
+    PAREVAL_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+fi
+
+command="$1"
+shift
+
+MODE=""
+case "$command" in
+    generate)
+        MODE="generate"
+        ;;
+    evaluate)
+        MODE="evaluate"
+        ;;
+    help | -h | --help)
+        echo "ParEval - A framework for evaluating LLMs on parallel code generation tasks."
+        echo "Usage: pareval <command> [options]"
+        echo ""
+        echo "Commands:"
+        echo " generate Generate LLM outputs for ParEval. See generate/generate.py for full argument list."
+        echo " evaluate Evaluate LLM outputs for ParEval. See drivers/run-all.py for full argument list."
+        echo " help Show this help message."
+        echo " -h, --help Show this help message."
+        echo " version Show the version of ParEval."
+        echo " -v, --version Show the version of ParEval."
+        echo ""
+        echo "For detailed usage of each command, run 'pareval <command> --help'."
+        echo ""
+        echo "For more information, visit the ParEval GitHub repository: https://github.com/parallelcodefoundry/ParEval"
+        echo ""
+        exit 0
+        ;;
+    version | -v | --version)
+        echo "ParEval version: $VERSION"
+        exit 0
+        ;;
+    *)
+        echo "Unknown command: $command. Use 'pareval help' for usage information." >&2
+        exit 1
+        ;;
+esac
+
+# Defensive check: the case statement above already exits on unknown commands, so MODE should always be valid here.
+if [[ "$MODE" != "generate" && "$MODE" != "evaluate" ]]; then
+    echo "Invalid mode: $MODE. Use 'pareval help' for usage information." >&2
+    exit 1
+fi
+
+# generate mode: forward all remaining arguments to generate/generate.py
+if [[ "$MODE" == "generate" ]]; then
+    # Check if the generate script exists
+    if [[ ! -f "${PAREVAL_ROOT}/generate/generate.py" ]]; then
+        echo "Error: generate script not found in '${PAREVAL_ROOT}'. Please ensure you are in the correct directory." >&2
+        exit 1
+    fi
+
+    python "${PAREVAL_ROOT}/generate/generate.py" "$@"
+fi
+
+# evaluate mode: forward all remaining arguments to drivers/run-all.py
+if [[ "$MODE" == "evaluate" ]]; then
+    # Check if the evaluate script exists
+    if [[ ! -f "${PAREVAL_ROOT}/drivers/run-all.py" ]]; then
+        echo "Error: evaluate script not found in '${PAREVAL_ROOT}'. Please ensure you are in the correct directory." >&2
+        exit 1
+    fi
+    # Prepend the root to PYTHONPATH; the ${VAR:+...} form avoids a trailing ':' (an empty entry) when PYTHONPATH is unset.
+    PYTHONPATH="${PAREVAL_ROOT}${PYTHONPATH:+:${PYTHONPATH}}" python "${PAREVAL_ROOT}/drivers/run-all.py" "$@"
+fi
\ No newline at end of file
diff --git a/drivers/build-configs.json b/drivers/build-configs.json
new file mode 100644
index 0000000..4b15102
--- /dev/null
+++ b/drivers/build-configs.json
@@ -0,0 +1,9 @@
+{
+    "serial": {"CXX": "g++", "CXXFLAGS": "-std=c++17 -O3"},
+    "omp": {"CXX": "g++", "CXXFLAGS": "-std=c++17 -O3 -fopenmp"},
+    "mpi": {"CXX": "mpicxx", "CXXFLAGS": "-std=c++17 -O3"},
+    "mpi+omp": {"CXX": "mpicxx", "CXXFLAGS": "-std=c++17 -O3 -fopenmp"},
+    "kokkos": {"CXX": "g++", "CXXFLAGS": "-std=c++17 -O3 -fopenmp -I../tpl/kokkos/build/include ../tpl/kokkos/build/lib64/libkokkoscore.a ../tpl/kokkos/build/lib64/libkokkoscontainers.a ../tpl/kokkos/build/lib64/libkokkossimd.a"},
+    "cuda": {"CXX": "nvcc", "CXXFLAGS": "-std=c++17 --generate-code arch=compute_80,code=sm_80 -O3 -Xcompiler \"-std=c++17 -O3\""},
+    "hip": {"CXX": "hipcc", "CXXFLAGS": "-std=c++17 -O3 -Xcompiler \"-std=c++17\" -Xcompiler \"-O3\" -Wno-unused-result"}
+}
\ No newline at end of file
diff --git a/drivers/cpp/benchmarks/geometry/10_geometry_convex_hull/baseline.hpp 
b/drivers/cpp/benchmarks/geometry/10_geometry_convex_hull/baseline.hpp index 2a7ac20..09cee35 100644 --- a/drivers/cpp/benchmarks/geometry/10_geometry_convex_hull/baseline.hpp +++ b/drivers/cpp/benchmarks/geometry/10_geometry_convex_hull/baseline.hpp @@ -29,6 +29,8 @@ void NO_INLINE correctConvexHull(std::vector const& points, std::vector

lowerHull; upperHull.push_back(pointsSorted[0]); upperHull.push_back(pointsSorted[1]); + lowerHull.push_back(pointsSorted[pointsSorted.size() - 1]); + lowerHull.push_back(pointsSorted[pointsSorted.size() - 2]); for (size_t i = 2; i < pointsSorted.size(); i++) { while (upperHull.size() > 1 @@ -47,7 +49,7 @@ void NO_INLINE correctConvexHull(std::vector const& points, std::vector

const& points) { std::vector lowerHull; upperHull.push_back(pointsSorted[0]); upperHull.push_back(pointsSorted[1]); + lowerHull.push_back(pointsSorted[pointsSorted.size() - 1]); + lowerHull.push_back(pointsSorted[pointsSorted.size() - 2]); for (size_t i = 2; i < pointsSorted.size(); i++) { while (upperHull.size() > 1 @@ -52,7 +54,7 @@ double NO_INLINE correctConvexHullPerimeter(std::vector const& points) { } lowerHull.push_back(pointsSorted[pointsSorted.size() - i - 1]); } - upperHull.insert(upperHull.end(), lowerHull.begin(), lowerHull.end()); + upperHull.insert(upperHull.end(), lowerHull.begin()+1, lowerHull.end()-1); double perimeter = 0; for (size_t i = 0; i < upperHull.size() - 1; i++) { diff --git a/drivers/cpp/benchmarks/sparse_la/48_sparse_la_sparse_axpy/cpu.cc b/drivers/cpp/benchmarks/sparse_la/48_sparse_la_sparse_axpy/cpu.cc index 45cb6a7..57718c0 100644 --- a/drivers/cpp/benchmarks/sparse_la/48_sparse_la_sparse_axpy/cpu.cc +++ b/drivers/cpp/benchmarks/sparse_la/48_sparse_la_sparse_axpy/cpu.cc @@ -122,7 +122,6 @@ bool validate(Context *ctx) { correctSparseAxpy(alpha, x, y, correct); // compute test result - test.clear(); sparseAxpy(alpha, x, y, test); SYNC(); diff --git a/drivers/cpp/cpp_driver_wrapper.py b/drivers/cpp/cpp_driver_wrapper.py index cc67026..c80381e 100644 --- a/drivers/cpp/cpp_driver_wrapper.py +++ b/drivers/cpp/cpp_driver_wrapper.py @@ -6,12 +6,10 @@ import copy import logging import os -from os import PathLike, environ -import shlex +from os import PathLike import subprocess import sys import tempfile -from typing import List # local imports sys.path.append("..") @@ -43,7 +41,7 @@ def build_kokkos(driver_src: PathLike, output_root: PathLike, problem_size: str = "(1<<20)"): """ Custom steps for the Kokkos programs, since they require cmake """ # cp cmake file into the output directory - cmake_path = "cpp/KokkosCMakeLists.txt" + cmake_path = os.path.join("cpp", "KokkosCMakeLists.txt") cmake_dest = os.path.join(output_root, "CMakeLists.txt") 
run_command(f"cp {cmake_path} {cmake_dest}", dry=False) @@ -59,6 +57,8 @@ class CppDriverWrapper(DriverWrapper): def __init__(self, **kwargs): super().__init__(**kwargs) + + self.build_configs = self.build_configs or COMPILER_SETTINGS self.model_driver_file = os.path.join("cpp", "models", DRIVER_MAP[self.parallelism_model]) def write_source(self, content: str, fpath: PathLike) -> bool: @@ -127,7 +127,7 @@ def test_single_output(self, prompt: str, output: str, test_driver_file: PathLik # compile and run the output exec_path = os.path.join(tmpdir, "a.out") - compiler_kwargs = copy.deepcopy(COMPILER_SETTINGS[self.parallelism_model]) + compiler_kwargs = copy.deepcopy(self.build_configs[self.parallelism_model]) compiler_kwargs["problem_size"] = problem_size # for kokkos compiler_kwargs["CXXFLAGS"] += f" -I{tmpdir} -DDRIVER_PROBLEM_SIZE=\"{problem_size}\"" build_result = self.compile(self.model_driver_file, test_driver_file, output_path=exec_path, **compiler_kwargs) diff --git a/drivers/driver_wrapper.py b/drivers/driver_wrapper.py index 0e18aa4..4c705f4 100644 --- a/drivers/driver_wrapper.py +++ b/drivers/driver_wrapper.py @@ -167,6 +167,7 @@ def __init__( self, parallelism_model: str = "serial", launch_configs: dict = {"format": "{exec_path} {args}", "params": [{}]}, + build_configs: Optional[dict] = None, problem_sizes: dict = {}, scratch_dir: Optional[PathLike] = None, build_timeout: int = 20, @@ -180,6 +181,7 @@ def __init__( self.validator = VALIDATORS[parallelism_model] self.scratch_dir = scratch_dir self.launch_configs = launch_configs[parallelism_model] + self.build_configs = build_configs self.problem_sizes = problem_sizes self.build_timeout = build_timeout self.run_timeout = run_timeout @@ -213,15 +215,15 @@ def test_single_output(self, prompt: str, output: str, test_driver_file: PathLik def test_all_outputs_in_prompt(self, prompt: dict) -> dict: """ Run all the generated outputs in the given prompt. 
""" - root = prompt["language"] + lang = prompt["language"] type = prompt["problem_type"] name = prompt["name"] ext = LANGUAGE_EXTENSIONS[prompt["language"]] - if root == "cpp" and self.parallelism_model in ["cuda", "hip"]: + if lang == "cpp" and self.parallelism_model in ["cuda", "hip"]: ext = ".cu" - driver_root = f"{name}" + driver_dirname = f"{name}" driver_base = DRIVER_MAP[self.parallelism_model] - test_driver_file = os.path.join(root, "benchmarks", type, driver_root, driver_base + ext) + test_driver_file = os.path.join(lang, "benchmarks", type, driver_dirname, driver_base + ext) problem_size = self.problem_sizes.get(name, {}).get(self.parallelism_model, "(1<<18)") outputs = [] diff --git a/drivers/run-all.py b/drivers/run-all.py index 048dfa4..afaad91 100755 --- a/drivers/run-all.py +++ b/drivers/run-all.py @@ -5,10 +5,10 @@ """ # std imports from argparse import ArgumentParser +import contextlib import json import logging import os -import tempfile from typing import Optional # tpl imports @@ -30,8 +30,11 @@ def get_args(): parser.add_argument("input_json", type=str, help="Input JSON file containing the test cases.") parser.add_argument("-o", "--output", type=str, help="Output JSON file containing the results.") parser.add_argument("--scratch-dir", type=str, help="If provided, put scratch files here.") + parser.add_argument("--driver-root", type=str, help="Where to look for the driver files, if not in cwd.") parser.add_argument("--launch-configs", type=str, default="launch-configs.json", help="config for how to run samples.") + parser.add_argument("--build-configs", type=str, default="build-configs.json", + help="config for how to build samples. 
If not provided, will use the default build settings for each model.") parser.add_argument("--problem-sizes", type=str, default="problem-sizes.json", help="config for how to run samples.") parser.add_argument("--yes-to-all", action="store_true", help="If provided, automatically answer yes to all prompts.") @@ -56,11 +59,19 @@ def get_args(): parser.add_argument("--log-runs", action="store_true", help="Display the stderr and stdout of runs.") return parser.parse_args() -def get_driver(prompt: dict, scratch_dir: Optional[os.PathLike], launch_configs: dict, problem_sizes: dict, dry: bool, **kwargs) -> DriverWrapper: +def get_driver( + prompt: dict, + scratch_dir: Optional[os.PathLike], + launch_configs: dict, + build_configs: dict, + problem_sizes: dict, + dry: bool, + **kwargs +) -> DriverWrapper: """ Get the language drive wrapper for this prompt """ driver_cls = LANGUAGE_DRIVERS[prompt["language"]] return driver_cls(parallelism_model=prompt["parallelism_model"], launch_configs=launch_configs, - problem_sizes=problem_sizes, scratch_dir=scratch_dir, dry=dry, **kwargs) + build_configs=build_configs, problem_sizes=problem_sizes, scratch_dir=scratch_dir, dry=dry, **kwargs) def already_has_results(prompt: dict) -> bool: """ Check if a prompt already has results stored in it. """ @@ -102,10 +113,25 @@ def main(): launch_configs = load_json(args.launch_configs) logging.info(f"Loaded launch configs from {args.launch_configs}.") + # load build configs + build_configs = load_json(args.build_configs) + logging.info(f"Loaded build configs from {args.build_configs}.") + # load problem sizes problem_sizes = load_json(args.problem_sizes) logging.info(f"Loaded problem sizes from {args.problem_sizes}.") + # set driver root; If provided, use user argument. If it's not provided, then check if the PAREVAL_ROOT environment + # variable is set, then use "${PAREVAL_ROOT}/drivers" as the root. If neither is set, then use the location of + # this script as the root. 
+ if args.driver_root: + DRIVER_ROOT = args.driver_root + elif "PAREVAL_ROOT" in os.environ: + DRIVER_ROOT = os.path.join(os.environ["PAREVAL_ROOT"], "drivers") + else: + DRIVER_ROOT = os.path.dirname(os.path.abspath(__file__)) + logging.info(f"Using driver root: {DRIVER_ROOT}") + # gather the list of parallelism models to test models_to_test = args.include_models if args.include_models else ["serial", "omp", "mpi", "mpi+omp", "kokkos", "cuda", "hip"] if args.exclude_models: @@ -139,15 +165,18 @@ def main(): prompt, args.scratch_dir, launch_configs, + build_configs, problem_sizes, args.dry, display_build_errors=args.log_build_errors, display_runs=args.log_runs, early_exit_runs=args.early_exit_runs, build_timeout=args.build_timeout, - run_timeout=args.run_timeout + run_timeout=args.run_timeout, ) - driver.test_all_outputs_in_prompt(prompt) + + with contextlib.chdir(DRIVER_ROOT): + driver.test_all_outputs_in_prompt(prompt) # go ahead and write out outputs now if args.output and args.output != '-': diff --git a/drivers/test-serial-outputs.json b/drivers/test-serial-outputs.json index 6cd458b..2c3dcfb 100644 --- a/drivers/test-serial-outputs.json +++ b/drivers/test-serial-outputs.json @@ -1,24 +1,60 @@ [ { - "problem_type": "scan", + "problem_type": "geometry", "language": "cpp", - "name": "28_scan_sum_of_prefix_sum_array", + "name": "13_geometry_closest_pair_2d", "parallelism_model": "serial", - "prompt": "/* Compute the prefix sum array of the vector x and return its sum.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the distance between the closest two points in the vector points.\n Example: \n\n input: [{2, 3}, {12, 30}, {40, 50}, {5, 1}, {12, 10}, {3, 4}]\n output: 1.41421\n*/\ndouble closestPair(std::vector const& 
points) {", "outputs": [ - "\n std::vector prefixSum(x.size());\n std::inclusive_scan(x.begin(), x.end(), prefixSum.begin());\n return std::accumulate(prefixSum.begin(), prefixSum.end(), 0.0);\n}", + "\n // The polygon needs to have at least two points\n if (points.size() < 2) {\n return 0;\n }\n\n auto getDist = [](Point const& a, Point const& b) {\n return std::sqrt(std::pow(b.x-a.x, 2) + std::pow(b.y-a.y, 2));\n };\n\n double minDist = std::numeric_limits::max();\n for (size_t i = 0; i < points.size() - 1; i++) {\n for (size_t j = i + 1; j < points.size(); j++) {\n const double dist = getDist(points[i], points[j]);\n if (dist < minDist) {\n minDist = dist;\n }\n }\n }\n\n return minDist;\n}\n", " return 0; }", " undefinedFunction(); }" ] }, { - "problem_type": "transform", + "problem_type": "geometry", "language": "cpp", - "name": "53_transform_negate_odds", + "name": "11_geometry_convex_hull_perimeter", "parallelism_model": "serial", - "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble distance(Point const& p1, Point const& p2) {\n\treturn std::sqrt(std::pow(p2.x-p1.x, 2) + std::pow(p2.y-p1.y, 2));\n}\n\n/* Return the perimeter of the smallest convex polygon that contains all the points in the vector points.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: 13.4477\n*/\ndouble convexHullPerimeter(std::vector const& points) {", "outputs": [ - "\n std::transform(x.begin(), x.end(), x.begin(), [](int i) {\n if (i % 2 == 0) {\n return i / 2;\n } else {\n return -i;\n }\n });\n}", + "\n // The polygon needs to have at least three points\n if (points.size() < 3) {\n return 0;\n }\n\n std::vector pointsSorted = points;\n\n std::sort(pointsSorted.begin(), pointsSorted.end(), [](Point const& a, 
Point const& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n auto CrossProduct = [](Point const& a, Point const& b, Point const& c) {\n return (c.x - a.x) * (b.y - a.y) - (c.y - a.y) * (b.x - a.x) > 0;\n };\n\n auto dist = [](Point const& p1, Point const& p2) {\n return sqrt(pow(p2.x-p1.x, 2) + pow(p2.y-p1.y, 2));\n };\n\n\n std::vector upperHull;\n std::vector lowerHull;\n upperHull.push_back(pointsSorted[0]);\n upperHull.push_back(pointsSorted[1]);\n lowerHull.push_back(pointsSorted[pointsSorted.size() - 1]);\n lowerHull.push_back(pointsSorted[pointsSorted.size() - 2]);\n\n for (size_t i = 2; i < pointsSorted.size(); i++) {\n while (upperHull.size() > 1\n && !CrossProduct(upperHull[upperHull.size() - 2],\n upperHull[upperHull.size() - 1],\n pointsSorted[i])) {\n upperHull.pop_back();\n }\n upperHull.push_back(pointsSorted[i]);\n\n while (lowerHull.size() > 1\n && !CrossProduct(lowerHull[lowerHull.size() - 2],\n lowerHull[lowerHull.size() - 1],\n pointsSorted[pointsSorted.size() - i - 1])) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(pointsSorted[pointsSorted.size() - i - 1]);\n }\n upperHull.insert(upperHull.end(), lowerHull.begin()+1, lowerHull.end()-1);\n\n double perimeter = 0;\n for (size_t i = 0; i < upperHull.size() - 1; i++) {\n perimeter += dist(upperHull[i], upperHull[i+1]);\n }\n perimeter += dist(upperHull[0], upperHull[upperHull.size() - 1]);\n\n return perimeter;\n}\n", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "12_geometry_smallest_triangle", + "parallelism_model": "serial", + "prompt": "struct Point {\n\tdouble x, y;\n};\n\ndouble triangleArea(Point const& A, Point const& B, Point const& C) {\n return 0.5 * std::abs( A.x*(B.y-C.y) + B.x*(C.y-A.y) + C.x*(A.y-B.y) );\n}\n\n/* Return the area of the smallest triangle that can be formed by any 3 points.\n Example:\n\n input: [{0, 10}, {5, 5}, {1,0}, {-1, 1}, {-10, 0}]\n output: 5.5\n*/\ndouble 
smallestArea(std::vector const& points) {", + "outputs": [ + "\n // The polygon needs to have at least three points\n if (points.size() < 3) {\n return 0;\n }\n\n auto triArea = [](Point const& a, Point const& b, Point const& c) {\n return 0.5 * std::abs((a.x * (b.y - c.y) + b.x * (c.y - a.y) + c.x * (a.y - b.y)));\n };\n\n double minArea = std::numeric_limits::max();\n for (size_t i = 0; i < points.size() - 2; i++) {\n for (size_t j = i + 1; j < points.size() - 1; j++) {\n for (size_t k = j + 1; k < points.size(); k++) {\n const double area = triArea(points[i], points[j], points[k]);\n if (area < minArea) {\n minArea = area;\n }\n }\n }\n }\n\n return minArea;\n}\n", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "14_geometry_closest_pair_1d", + "parallelism_model": "serial", + "prompt": "double distance(double x1, double x2) {\n\treturn std::abs(x1 - x2);\n}\n\n/* Return the distance between the closest two elements in the vector x.\n Example: \n\n input: [7, 3, 9, 12, 31, 1]\n output: 2\n*/\ndouble closestPair(std::vector const& x) {", + "outputs": [ + "\n // The vector needs to have at least two elements\n if (x.size() < 2) {\n return 0;\n }\n\n auto getDist = [](double const& a, double const& b) {\n return std::abs(b-a);\n };\n\n double minDist = std::numeric_limits::max();\n for (size_t i = 0; i < x.size() - 1; i++) {\n for (size_t j = i + 1; j < x.size(); j++) {\n const double dist = getDist(x[i], x[j]);\n if (dist < minDist) {\n minDist = dist;\n }\n }\n }\n\n return minDist;\n}\n", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "geometry", + "language": "cpp", + "name": "10_geometry_convex_hull", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Find the set of points that defined the smallest convex polygon that contains all the points in the vector points. 
Store the result in `hull`.\n Example:\n\n input: [{0, 3}, {1, 1}, {2, 2}, {4, 4}, {0, 0}, {1, 2}, {3, 1}, {3, 3}]\n output: [{0, 3}, {4, 4}, {3, 1}, {0, 0}]\n*/\nvoid convexHull(std::vector const& points, std::vector &hull) {", + "outputs": [ + "\n // The polygon needs to have at least three points\n if (points.size() < 3) {\n hull = points;\n return;\n }\n\n std::vector pointsSorted = points;\n\n std::sort(pointsSorted.begin(), pointsSorted.end(), [](Point const& a, Point const& b) {\n return a.x < b.x || (a.x == b.x && a.y < b.y);\n });\n\n auto CrossProduct = [](Point const& a, Point const& b, Point const& c) {\n return (c.x - a.x) * (b.y - a.y) - (c.y - a.y) * (b.x - a.x) > 0;\n };\n\n std::vector upperHull;\n std::vector lowerHull;\n upperHull.push_back(pointsSorted[0]);\n upperHull.push_back(pointsSorted[1]);\n lowerHull.push_back(pointsSorted[pointsSorted.size() - 1]);\n lowerHull.push_back(pointsSorted[pointsSorted.size() - 2]);\n\n for (size_t i = 2; i < pointsSorted.size(); i++) {\n while (upperHull.size() > 1\n && !CrossProduct(upperHull[upperHull.size() - 2],\n upperHull[upperHull.size() - 1],\n pointsSorted[i])) {\n upperHull.pop_back();\n }\n upperHull.push_back(pointsSorted[i]);\n\n while (lowerHull.size() > 1\n && !CrossProduct(lowerHull[lowerHull.size() - 2],\n lowerHull[lowerHull.size() - 1],\n pointsSorted[pointsSorted.size() - i - 1])) {\n lowerHull.pop_back();\n }\n lowerHull.push_back(pointsSorted[pointsSorted.size() - i - 1]);\n }\n upperHull.insert(upperHull.end(), lowerHull.begin()+1, lowerHull.end()-1);\n\n hull = upperHull;\n return;\n}\n", " }", " undefinedFunction(); }" ] @@ -26,11 +62,11 @@ { "problem_type": "transform", "language": "cpp", - "name": "56_transform_map_function", + "name": "59_transform_map_function", "parallelism_model": "serial", "prompt": "bool isPowerOfTwo(int x) {\n\treturn (x > 0) && !(x & (x - 1));\n}\n\n/* Apply the isPowerOfTwo function to every value in x and store the results in mask.\n Example:\n\n input: 
[8, 0, 9, 7, 15, 64, 3]\n output: [true, false, false, false, false, true, false]\n*/\nvoid mapPowersOfTwo(std::vector const& x, std::vector &mask) {", "outputs": [ - "\n for (int i = 0; i < x.size(); i++) {\n mask[i] = isPowerOfTwo(x[i]);\n }\n}", + "\n for (int i = 0; i < x.size(); i++) {\n #if defined(USE_CUDA) || defined(USE_HIP)\n mask[i] = isPowerOfTwoHOST(x[i]);\n #else\n mask[i] = isPowerOfTwo(x[i]);\n #endif\n }\n}", " }", " undefinedFunction(); }" ] @@ -38,11 +74,11 @@ { "problem_type": "transform", "language": "cpp", - "name": "55_transform_squaring", + "name": "57_transform_inverse_offset", "parallelism_model": "serial", - "prompt": "/* Replace every element of x with the square of its value.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "prompt": "/* Replace every element of the vector x with 1-1/x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", "outputs": [ - "\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}", + "\n std::transform(x.begin(), x.end(), x.begin(), [](double x) { return 1.0 - 1.0 / x; });\n}", " }", " undefinedFunction(); }" ] @@ -50,7 +86,7 @@ { "problem_type": "transform", "language": "cpp", - "name": "52_transform_relu", + "name": "55_transform_relu", "parallelism_model": "serial", "prompt": "/* Compute the ReLU function on every element of x. 
Elements less than zero become zero,\n while elements greater than zero stay the same.\n Example:\n\n input: [-1.8, 24.0, 1.2, 0.0, -5.1, -0.2, 4.5]\n output: [0, 24.0, 1.2, 0, 0, 0, 4.5]\n*/\nvoid relu(std::vector &x) {", "outputs": [ @@ -62,11 +98,167 @@ { "problem_type": "transform", "language": "cpp", - "name": "54_transform_inverse_offset", + "name": "56_transform_negate_odds", "parallelism_model": "serial", - "prompt": "/* Replace every element of the vector x with 1-1/x.\n Example:\n\n input: [2, 4, 1, 12, -2]\n output: [0.5, 0.75, 0, 0.91666666, 1.5]\n*/\nvoid oneMinusInverse(std::vector &x) {", + "prompt": "/* In the vector x negate the odd values and divide the even values by 2.\n Example:\n\n input: [16, 11, 12, 14, 1, 0, 5]\n output: [8, -11, 6, 7, -1, 0, -5]\n*/\nvoid negateOddsAndHalveEvens(std::vector &x) {", "outputs": [ - "\n std::transform(x.begin(), x.end(), x.begin(), [](double x) { return 1.0 - 1.0 / x; });\n}", + "\n std::transform(x.begin(), x.end(), x.begin(), [](int i) {\n if (i % 2 == 0) {\n return i / 2;\n } else {\n return -i;\n }\n });\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "transform", + "language": "cpp", + "name": "58_transform_squaring", + "parallelism_model": "serial", + "prompt": "/* Replace every element of x with the square of its value.\n Example:\n\n input: [5, 1, 2, -4, 8]\n output: [25, 1, 4, 16, 64]\n*/\nvoid squareEach(std::vector &x) {", + "outputs": [ + "\n for (size_t i = 0; i < x.size(); i++) {\n x[i] = x[i] * x[i];\n }\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "26_reduce_product_of_inverses", + "parallelism_model": "serial", + "prompt": "/* Return the product of the vector x with every odd indexed element inverted.\n i.e. 
x_0 * 1/x_1 * x_2 * 1/x_3 * x_4 ...\n Example:\n\n input: [4, 2, 10, 4, 5]\n output: 25\n*/\ndouble productWithInverses(std::vector const& x) {", + "outputs": [ + "\n std::vector data;\n for (size_t i = 0; i < x.size(); i++)\n data.push_back(i % 2 ? 1.0 / x[i] : x[i]);\n return std::reduce(data.begin(), data.end(), 1.0, std::multiplies());\n}\n", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "27_reduce_average", + "parallelism_model": "serial", + "prompt": "/* Return the average of the vector x.\n Examples:\n\t\t\n\t input: [1, 8, 4, 5, 1]\n output: 3.8\n\n input: [2, 2, 2, 3]\n output: 2.25\n*/\ndouble average(std::vector const& x) {", + "outputs": [ + "\n return std::reduce(x.begin(), x.end(), 0.0) / (double) x.size();\n}\n", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "28_reduce_smallest_odd_number", + "parallelism_model": "serial", + "prompt": "/* Return the value of the smallest odd number in the vector x.\n Examples:\n\n input: [7, 9, 5, 2, 8, 16, 4, 1]\n output: 1\n\n input: [8, 36, 7, 2, 11]\n output: 7\n*/\nint smallestOdd(std::vector const& x) {", + "outputs": [ + "\n return std::reduce(x.begin(), x.end(), std::numeric_limits::max(), [] (const auto &a, const auto &b) {\n if (a < b) {\n if (a % 2 == 1) return a;\n else if (b % 2 == 1) return b;\n else return std::numeric_limits::max();\n } else {\n if (b % 2 == 1) return b;\n else if (a % 2 == 1) return a;\n else return std::numeric_limits::max();\n }\n });\n}\n", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "25_reduce_xor", + "parallelism_model": "serial", + "prompt": "/* Return the logical XOR reduction of the vector of bools x.\n Example:\n\n input: [false, false, false, true]\n output: true\n*/\nbool reduceLogicalXOR(std::vector const& x) {", + "outputs": [ + "\n return 
std::reduce(x.begin(), x.end(), false, [] (const auto &a, const auto &b) {\n return a != b;\n });\n}\n", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "reduce", + "language": "cpp", + "name": "29_reduce_sum_of_min_of_pairs", + "parallelism_model": "serial", + "prompt": "/* Return the sum of the minimum value at each index of vectors x and y for all indices.\n i.e. sum = min(x_0, y_0) + min(x_1, y_1) + min(x_2, y_2) + ...\n Example:\n\n input: x=[3, 4, 0, 2, 3], y=[2, 5, 3, 1, 7]\n output: 10\n*/\ndouble sumOfMinimumElements(std::vector const& x, std::vector const& y) {", + "outputs": [ + "\n std::vector z;\n z.resize(x.size());\n std::transform(x.begin(), x.end(), y.begin(), z.begin(), [] (const auto &a, const auto &b) {\n return std::min(a, b);\n });\n return std::reduce(z.begin(), z.end());\n}\n", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "00_dense_la_lu_decomp", + "parallelism_model": "serial", + "prompt": "/* Factorize the matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n Store the results for L and U into the original matrix A. \n A is an NxN matrix stored in row-major.\n Example:\n\n input: [[4, 3], [6, 3]]\n output: [[4, 3], [1.5, -1.5]]\n*/\nvoid luFactorize(std::vector &A, size_t N) {", + "outputs": [ + "\n for (size_t k = 0; k < N; ++k) {\n for (size_t i = k + 1; i < N; ++i) {\n\n double factor = A[i * N + k] / A[k * N + k];\n A[i * N + k] = factor;\n \n for (size_t j = k + 1; j < N; ++j) {\n A[i * N + j] -= factor * A[k * N + j];\n }\n }\n }\n}", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "04_dense_la_gemv", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the vector x. 
Store the results in the vector y.\n A is an MxN matrix stored in row-major, x has N elements, and y has M elements.\n Example:\n\n input: A=[[1, -1, 2], [0, -3, 1]] x=[2, 1, 0]\n output: y=[1, -3]\n*/\nvoid gemv(std::vector const& A, std::vector const& x, std::vector &y, size_t M, size_t N) {", + "outputs": [ + "\n for (size_t i = 0; i < M; i++) {\n y[i] = 0;\n for (size_t j = 0; j < N; j++) {\n y[i] += A[i * N + j] * x[j];\n }\n }\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "02_dense_la_gemm", + "parallelism_model": "serial", + "prompt": "/* Multiply the matrix A by the matrix B. Store the results in the matrix C.\n A is an MxK matrix, B is a KxN matrix, and C is a MxN matrix. The matrices are stored in row-major.\n Example:\n\n input: A=[[1, -1, 2], [0, -2, 1]] B=[[4, 1], [-1, 0], [2, 2]]\n output: C=[[9, 5], [4, 2]]\n*/\nvoid gemm(std::vector const& A, std::vector const& B, std::vector &C, size_t M, size_t K, size_t N) {", + "outputs": [ + "\n for (size_t i = 0; i < M; i += 1) {\n for (size_t k = 0; k < K; k += 1) {\n for (size_t j = 0; j < N; j += 1) {\n C[i*N + j] += A[i*K + k] * B[k*N + j];\n }\n }\n }\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "01_dense_la_solve", + "parallelism_model": "serial", + "prompt": "/* Solve the linear system Ax=b for x.\n A is an NxN matrix in row-major. 
x and b have N elements.\n Example:\n \n input: A=[[1,4,2], [1,2,3], [2,1,3]] b=[11, 11, 13]\n output: x=[3, 1, 2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "outputs": [ + "\n // Create a copy of A to perform Gaussian elimination\n std::vector A_copy = A;\n std::vector b_copy = b;\n\n // Gaussian elimination\n for (size_t i = 0; i < N - 1; i++) {\n // Find the pivot element\n double pivot = A_copy[i * N + i];\n\n // Check if the pivot is zero\n if (pivot == 0) {\n return;\n }\n\n // Eliminate the elements below the pivot\n for (size_t j = i + 1; j < N; j++) {\n double factor = A_copy[j * N + i] / pivot;\n for (size_t k = i; k < N; k++) {\n A_copy[j * N + k] -= factor * A_copy[i * N + k];\n }\n b_copy[j] -= factor * b_copy[i];\n }\n }\n\n // Back substitution\n for (int i = N - 1; i >= 0; i--) {\n double sum = 0;\n for (size_t j = i + 1; j < N; j++) {\n sum += A_copy[i * N + j] * x[j];\n }\n x[i] = (b_copy[i] - sum) / A_copy[i * N + i];\n }\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "dense_la", + "language": "cpp", + "name": "03_dense_la_axpy", + "parallelism_model": "serial", + "prompt": "/* Compute z = alpha*x+y where x and y are vectors. 
Store the result in z.\n Example:\n \n input: x=[1, -5, 2, 9] y=[0, 4, 1, -1] alpha=2\n output: z=[2, -6, 5, 17]\n*/\nvoid axpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "outputs": [ + "\n for (size_t i = 0; i < x.size(); i += 1) {\n z[i] = alpha*x[i] + y[i];\n }\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "41_sort_k-th_smallest_element", + "parallelism_model": "serial", + "prompt": "/* Find the k-th smallest element of the vector x.\n Example:\n \n input: x=[1, 7, 6, 0, 2, 2, 10, 6], k=4\n output: 6\n*/\nint findKthSmallest(std::vector const& x, int k) {", + "outputs": [ + "\n std::vector x_copy = x;\n std::sort(x_copy.begin(), x_copy.end());\n return x_copy[k-1];\n}", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "43_sort_sort_an_array_of_structs_by_key", + "parallelism_model": "serial", + "prompt": "struct Result {\n int startTime, duration;\n float value;\n};\n\n/* Sort vector of Result structs by start time in ascending order.\n Example:\n \n input: [{startTime=8, duration=4, value=-1.22}, {startTime=2, duration=10, value=1.0}, {startTime=10, duration=3, value=0.0}]\n output: [{startTime=2, duration=10, value=1.0}, {startTime=8, duration=4, value=-1.22}, {startTime=10, duration=3, value=0.0}]\n*/\nvoid sortByStartTime(std::vector &results) {", + "outputs": [ + "\n std::sort(results.begin(), results.end(), [](Result const& a, Result const& b) {\n return a.startTime < b.startTime;\n });\n}", " }", " undefinedFunction(); }" ] @@ -74,7 +266,7 @@ { "problem_type": "sort", "language": "cpp", - "name": "41_sort_sort_non-zero_elements", + "name": "44_sort_sort_non-zero_elements", "parallelism_model": "serial", "prompt": "/* Sort the vector x in ascending order ignoring elements with value 0.\n Leave zero valued elements in-place.\n Example:\n\n\t input: [8, 4, 0, 9, 8, 0, 1, -1, 7]\n output: [-1, 
1, 0, 4, 7, 0, 8, 8, 9]\n*/\nvoid sortIgnoreZero(std::vector &x) {", "outputs": [ @@ -82,5 +274,449 @@ " }", " undefinedFunction(); }" ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "40_sort_sort_an_array_of_complex_numbers_by_magnitude", + "parallelism_model": "serial", + "prompt": "/* Sort the vector x of complex numbers by their magnitude in ascending order.\n Example:\n \n input: [3.0-1.0i, 4.5+2.1i, 0.0-1.0i, 1.0-0.0i, 0.5+0.5i]\n output: [0.5+0.5i, 0.0-1.0i, 1.0-0.0i, 3.0-1.0i, 4.5+2.1i]\n*/\nvoid sortComplexByMagnitude(std::vector> &x) {", + "outputs": [ + "\n std::sort(x.begin(), x.end(), [](const std::complex &a, const std::complex &b) {\n return std::abs(a) < std::abs(b);\n });\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "sort", + "language": "cpp", + "name": "42_sort_sorted_ranks", + "parallelism_model": "serial", + "prompt": "/* For each value in the vector x compute its index in the sorted vector.\n Store the results in `ranks`.\n Examples:\n\n input: [3.1, 2.8, 9.1, 0.4, 3.14]\n output: [2, 1, 4, 0, 3]\n \n input: [100, 7.6, 16.1, 18, 7.6]\n output: [4, 0, 1, 2, 3]\n*/\nvoid ranks(std::vector const& x, std::vector &ranks) {", + "outputs": [ + "\n std::vector indices(x.size());\n std::iota(indices.begin(), indices.end(), 0);\n\n std::sort(indices.begin(), indices.end(), [&x](size_t i1, size_t i2) {\n return x[i1] < x[i2];\n });\n\n for (int i = 0; i < indices.size(); i += 1) {\n ranks[indices[i]] = i;\n }\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "46_sparse_la_spmm", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute the matrix multiplication Y=AX. A is a sparse MxK matrix in COO format.\n X is a sparse KxN matrix in COO format. 
Y is a dense MxN matrix in row-major.\n Example:\n\n input: A=[{0,0,-2}, {0,1,1}, {1,1,-1}] X=[{0,1,2}, {1,0,-1}]\n output: Y=[{-1,-4}, {1,0}]\n*/\nvoid spmm(std::vector const& A, std::vector const& X, std::vector &Y, size_t M, size_t K, size_t N) {", + "outputs": [ + "\n Y.assign(M * N, 0);\n\n for (const auto& a : A) {\n for (const auto& x : X) {\n if (a.column == x.row) {\n Y[a.row * N + x.column] += a.value * x.value;\n }\n }\n }\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "47_sparse_la_spmv", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Compute y = alpha*A*x + beta*y where alpha and beta are scalars, x and y are vectors,\n and A is a sparse matrix stored in COO format.\n A has dimensions MxN, x has N values, and y has M values.\n Example:\n\n input: alpha=0.5 beta=1.0 A=[{0,1,3}, {1,0,-1}] x=[-4, 2] y=[-1,1]\n output: y=[2, 3]\n*/\nvoid spmv(double alpha, std::vector const& A, std::vector const& x, double beta, std::vector &y, size_t M, size_t N) {", + "outputs": [ + "\n for (auto& element : y) {\n element *= beta;\n }\n \n for (const auto& a_ij : A) {\n if (a_ij.row < M && a_ij.column < N) {\n y[a_ij.row] += alpha * a_ij.value * x[a_ij.column];\n }\n }\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "48_sparse_la_sparse_axpy", + "parallelism_model": "serial", + "prompt": "struct Element {\n\tsize_t index;\n double value;\n};\n\n/* Compute z = alpha*x+y where x and y are sparse vectors. 
Store the result in the dense vector z.\n Example:\n \n input: x=[{5, 12}, {8, 3}, {12, -1}], y=[{3, 1}, {5, -2}, {7, 1}, {8, -3}], alpha=1\n output: z=[0, 0, 0, 1, 0, 10, 0, 1, 0, 0, 0, 0, -1]\n*/\nvoid sparseAxpy(double alpha, std::vector const& x, std::vector const& y, std::vector &z) {", + "outputs": [ + "\n size_t xi = 0, yi = 0;\n\n while (xi < x.size() && yi < y.size()) {\n if (x[xi].index < y[yi].index) {\n z[x[xi].index] += alpha * x[xi].value;\n ++xi;\n } else if (x[xi].index > y[yi].index) {\n z[y[yi].index] += y[yi].value;\n ++yi;\n } else {\n z[x[xi].index] += alpha * x[xi].value + y[yi].value;\n ++xi;\n ++yi;\n }\n }\n\n while (xi < x.size()) {\n z[x[xi].index] += alpha * x[xi].value;\n ++xi;\n }\n\n while (yi < y.size()) {\n z[y[yi].index] += y[yi].value;\n ++yi;\n }\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "45_sparse_la_sparse_solve", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Solve the sparse linear system Ax=b for x.\n A is a sparse NxN matrix in COO format. 
x and b are dense vectors with N elements.\n Example:\n \n input: A=[{0,0,1}, {0,1,1}, {1,1,-2}] b=[1,4]\n output: x=[3,-2]\n*/\nvoid solveLinearSystem(std::vector const& A, std::vector const& b, std::vector &x, size_t N) {", + "outputs": [ + "\n std::vector> matrix(N, std::vector(N, 0.0)); \n std::vector b_copy = b;\n\n // Fill the matrix with the values from A\n for (const auto& element : A) {\n matrix[element.row][element.column] = element.value;\n }\n\n // Initialize x with the size N\n x.assign(N, 0.0);\n\n // Perform Gaussian elimination\n for (size_t i = 0; i < N; ++i) {\n // Find pivot\n double maxEl = std::abs(matrix[i][i]);\n size_t maxRow = i;\n for (size_t k = i + 1; k < N; ++k) {\n if (std::abs(matrix[k][i]) > maxEl) {\n maxEl = std::abs(matrix[k][i]);\n maxRow = k;\n }\n }\n\n // Swap maximum row with current row (column by column)\n for (size_t k = i; k < N; ++k) {\n std::swap(matrix[maxRow][k], matrix[i][k]);\n }\n std::swap(b_copy[maxRow], b_copy[i]);\n\n // Make all rows below this one 0 in the current column\n for (size_t k = i + 1; k < N; ++k) {\n double c = -matrix[k][i] / matrix[i][i];\n for (size_t j = i; j < N; ++j) {\n if (i == j) {\n matrix[k][j] = 0;\n } else {\n matrix[k][j] += c * matrix[i][j];\n }\n }\n b_copy[k] += c * b_copy[i];\n }\n }\n\n // Solve equation Ax=b for an upper triangular matrix A\n for (int i = N - 1; i >= 0; --i) {\n x[i] = b_copy[i] / matrix[i][i];\n for (int k = i - 1; k >= 0; --k) {\n b_copy[k] -= matrix[k][i] * x[i];\n }\n }\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "sparse_la", + "language": "cpp", + "name": "49_sparse_la_sparse_lu_decomp", + "parallelism_model": "serial", + "prompt": "struct COOElement {\n size_t row, column;\n double value;\n};\n\n/* Factorize the sparse matrix A into A=LU where L is a lower triangular matrix and U is an upper triangular matrix.\n A is a sparse NxN matrix stored in COO format.\n Example:\n\n input: A=[{0,0,4}, {0,1,3}, {1,0,6}, {1,1,3}]\n output: 
L=[{0,0,1},{1,0,1.5}, {1,1,1}] U=[{0,0,4}, {0,1,3}, {1,1,-1.5}]\n*/\nvoid luFactorize(std::vector const& A, std::vector &L, std::vector &U, size_t N) {", + "outputs": [ + "\n std::vector> fullA(N, std::vector(N, 0));\n for (const auto& element : A) {\n fullA[element.row][element.column] = element.value;\n }\n \n // LU factorization algorithm\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n if (j >= i) {\n U[i * N + j] = fullA[i][j];\n for (size_t k = 0; k < i; ++k) {\n U[i * N + j] -= L[i * N + k] * U[k * N + j];\n }\n }\n if (i > j) {\n L[i * N + j] = fullA[i][j] / U[j * N + j];\n for (size_t k = 0; k < j; ++k) {\n L[i * N + j] -= L[i * N + k] * U[k * N + j] / U[j * N + j];\n }\n }\n }\n L[i * N + i] = 1;\n }\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "50_stencil_xor_kernel", + "parallelism_model": "serial", + "prompt": "/* Set every cell's value to 1 if it has exactly one neighbor that's a 1. Otherwise set it to 0.\n Note that we only consider neighbors and not input_{i,j} when computing output_{i,j}.\n input and output are NxN grids of ints in row-major.\n Example:\n\n input: [[0, 1, 1, 0],\n [1, 0, 0, 0],\n [0, 0, 0, 0],\n [0, 1, 0, 0]\n output: [[0, 0, 1, 1],\n [1, 0, 0, 1],\n [0, 0, 1, 0],\n [1, 0, 1, 0]]\n*/\nvoid cellsXOR(std::vector const& input, std::vector &output, size_t N) {", + "outputs": [ + "\n for (size_t i = 0; i < N; ++i) {\n for (size_t j = 0; j < N; ++j) {\n int count = 0;\n if (i > 0 && input[(i-1)*N + j] == 1) ++count;\n if (i < N-1 && input[(i+1)*N + j] == 1) ++count;\n if (j > 0 && input[i*N + j-1] == 1) ++count;\n if (j < N-1 && input[i*N + j+1] == 1) ++count;\n output[i*N + j] = (count == 1) ? 
1 : 0;\n }\n }\n}\n", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "51_stencil_edge_kernel", + "parallelism_model": "serial", + "prompt": "const int edgeKernel[3][3] = {{-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}};\n\n/* Convolve the edge kernel with a grayscale image. Each pixel will be replaced with\n the dot product of itself and its neighbors with the edge kernel.\n Use a value of 0 for pixels outside the image's boundaries and clip outputs between 0 and 255.\n imageIn and imageOut are NxN grayscale images stored in row-major.\n Store the output of the computation in imageOut.\n Example:\n\n input: [[112, 118, 141, 152],\n [93, 101, 119, 203],\n [45, 17, 16, 232],\n [82, 31, 49, 101]]\n output: [[255, 255, 255, 255],\n [255, 147, 0, 255],\n [36, 0, 0, 255],\n [255, 39, 0, 255]]\n*/\nvoid convolveKernel(std::vector const& imageIn, std::vector &imageOut, size_t N) {", + "outputs": [ + "\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++){\n int sum = 0;\n for (int k = -1; k < 2; k++) {\n for (int l = -1; l < 2; l++){\n int x = i + k;\n int y = j + l;\n if ((x < 0) || (x >= N) || (y < 0) || (y >= N)) {\n sum += 0;\n } else {\n sum += imageIn[x * N + y] * edgeKernel[k + 1][l + 1];\n }\n }\n }\n if (sum < 0) {\n imageOut[i * N + j] = 0;\n } else if (sum > 255) {\n imageOut[i * N + j] = 255;\n } else {\n imageOut[i * N + j] = sum;\n }\n }\n }\n}\n", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "53_stencil_2d_jacobi_5-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 5-point 2D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its four neighbors and stored in the corresponding element of `output`.\n i.e. 
output_{i,j} = (input_{i,j-1} + input_{i,j+1} + input_{i-1,j} + input_{i+1,j} + input_{i,j})/5\n Replace with 0 when reading past the boundaries of `input`.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[3, 4, 1], [0, 1, 7], [5, 3, 2]]\n output: [[1.4, 1.8, 2.4],[1.8, 3, 2.2], [1.6, 2.2, 2.4]]\n*/\nvoid jacobi2D(std::vector const& input, std::vector &output, size_t N) {", + "outputs": [ + "\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n double sum = 0;\n if (i > 0) {\n sum += input[(i - 1) * N + j];\n }\n if (i < N - 1) {\n sum += input[(i + 1) * N + j];\n }\n if (j > 0) {\n sum += input[i * N + (j - 1)];\n }\n if (j < N - 1) {\n sum += input[i * N + (j + 1)];\n }\n sum += input[i * N + j];\n output[i * N + j] = sum / 5.0;\n }\n }\n}\n", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "52_stencil_1d_jacobi_3-point_stencil", + "parallelism_model": "serial", + "prompt": "/* Compute one iteration of a 3-point 1D jacobi stencil on `input`. Store the results in `output`.\n Each element of `input` will be averaged with its two neighbors and stored in the corresponding element of `output`.\n i.e. output[i] = (input[i-1]+input[i]+input[i+1])/3\n Replace with 0 when reading past the boundaries of `input`.\n Example:\n\n input: [9, -6, -1, 2, 3]\n output: [1, 2/3, -5/3, 4/3, 5/3]\n*/\nvoid jacobi1D(std::vector const& input, std::vector &output) {", + "outputs": [ + "\n for (size_t i = 0; i < input.size(); i++) {\n double sum = 0.0;\n if (i > 0) {\n sum += input[i - 1];\n }\n if (i < input.size() - 1) {\n sum += input[i + 1];\n }\n sum += input[i];\n output[i] = sum / 3.0;\n }\n}\n", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "stencil", + "language": "cpp", + "name": "54_stencil_game_of_life", + "parallelism_model": "serial", + "prompt": "/* Simulate one generation of Game of Life on `input`. 
Store the results in `output`.\n A cell is 1 if it is alive and 0 if it is dead.\n If a live cell has fewer than 2 live neighbors then it dies.\n If a live cell has 2 or 3 live neighbors then it lives on.\n If a live cell has more than 3 live neighbords then it dies.\n If a cell is dead and has exactly 3 live neighbors then it becomes alive.\n `input` and `output` are NxN grids stored in row-major.\n Example:\n\n input: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 0, 0, 0],\n [0, 1, 1, 0, 0],\n [0, 0, 1, 1, 0],\n [0, 1, 0, 0, 0]]\n output: [[0, 0, 0, 0, 0],\n\t\t\t\t\t [0, 1, 1, 0, 0],\n [0, 1, 0, 1, 0],\n [0, 0, 0, 1, 0],\n [0, 0, 1, 0, 0]]\n*/\nvoid gameOfLife(std::vector const& input, std::vector &output, size_t N) {", + "outputs": [ + "\n for (size_t i = 0; i < N; i++) {\n for (size_t j = 0; j < N; j++) {\n int sum = 0;\n if (i > 0) {\n sum += input[(i - 1) * N + j];\n }\n if (i < N - 1) {\n sum += input[(i + 1) * N + j];\n }\n if (j > 0) {\n sum += input[i * N + (j - 1)];\n }\n if (j < N - 1) {\n sum += input[i * N + (j + 1)];\n }\n if (i > 0 && j > 0) {\n sum += input[(i - 1) * N + (j - 1)];\n }\n if (i > 0 && j < N - 1) {\n sum += input[(i - 1) * N + (j + 1)];\n }\n if (i < N - 1 && j > 0) {\n sum += input[(i + 1) * N + (j - 1)];\n }\n if (i < N - 1 && j < N - 1) {\n sum += input[(i + 1) * N + (j + 1)];\n }\n if (input[i * N + j] == 1) {\n if (sum < 2) {\n output[i * N + j] = 0;\n } else if (sum == 2 || sum == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n } else {\n if (sum == 3) {\n output[i * N + j] = 1;\n } else {\n output[i * N + j] = 0;\n }\n }\n }\n }\n}\n", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "20_histogram_pixel_histogram", + "parallelism_model": "serial", + "prompt": "/* Count the number of pixels in image with each grayscale intensity.\n The vector `image` is a grayscale image with values 0-255.\n Store the results in `bins`.\n Example:\n \n input: image=[2, 
116, 201, 11, 92, 92, 201, 4, 2]\n output: [0, 0, 2, 0, 1, ...]\n*/\n void pixelCounts(std::vector const& image, std::array &bins) {", + "outputs": [ + "\n for (int i = 0; i < image.size(); i += 1) {\n bins[image[i]] += 1;\n }\n}\n\n\n#if defined(USE_CUDA)\n// fix the issue where atomicAdd is not defined for size_t\nstatic_assert(sizeof(size_t) == sizeof(unsigned long long), \"size_t is not 64 bits\");\n\n__device__ __forceinline__ void atomicAdd(size_t* address, size_t val) {\n atomicAdd(reinterpret_cast(address), val);\n}\n#endif", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "24_histogram_count_quartile", + "parallelism_model": "serial", + "prompt": "/* Count the number of doubles in the vector x that have a fractional part \n in [0, 0.25), [0.25, 0.5), [0.5, 0.75), and [0.75, 1). Store the counts in `bins`.\n Examples:\n\n input: [7.8, 4.2, 9.1, 7.6, 0.27, 1.5, 3.8]\n output: [2, 1, 2, 2]\n\n input: [1.9, 0.2, 0.6, 10.1, 7.4]\n output: [2, 1, 1, 1]\n*/\nvoid countQuartiles(std::vector const& x, std::array &bins) {", + "outputs": [ + "\n for (int i = 0; i < x.size(); i += 1) {\n const double val = x[i];\n const double frac = val - (int) val;\n if (frac < 0.25) {\n bins[0] += 1;\n } else if (frac < 0.5) {\n bins[1] += 1;\n } else if (frac < 0.75) {\n bins[2] += 1;\n } else {\n bins[3] += 1;\n }\n }\n}\n\n#if defined(USE_CUDA)\n// fix the issue where atomicAdd is not defined for size_t\nstatic_assert(sizeof(size_t) == sizeof(unsigned long long), \"size_t is not 64 bits\");\n\n__device__ __forceinline__ void atomicAdd(size_t* address, size_t val) {\n atomicAdd(reinterpret_cast(address), val);\n}\n#endif", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "22_histogram_count_quadrants", + "parallelism_model": "serial", + "prompt": "struct Point {\n double x, y;\n};\n\n/* Count the number of cartesian points in each quadrant. 
The vector points contains a list of `Point` objects.\n Store the counts in `bins`.\n Example:\n\n input: [{x=1.5, y=0.1}, {x=-3, y=1.1}, {x=5, y=9}, {x=1.5, y=-1}, {x=3, y=-7}, {x=0.1, y=2}]\n output: [3, 1, 0, 2]\n*/\nvoid countQuadrants(std::vector const& points, std::array &bins) {", + "outputs": [ + "\n for (auto const& point : points) {\n if (point.x >= 0 && point.y >= 0) {\n bins[0] += 1;\n } else if (point.x < 0 && point.y >= 0) {\n bins[1] += 1;\n } else if (point.x < 0 && point.y < 0) {\n bins[2] += 1;\n } else if (point.x >= 0 && point.y < 0) {\n bins[3] += 1;\n }\n }\n}", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "23_histogram_first_letter_counts", + "parallelism_model": "serial", + "prompt": "/* For each letter in the alphabet, count the number of strings in the vector s that start with that letter.\n Assume all strings are in lower case. Store the output in `bins` array.\n Example:\n\n input: [\"dog\", \"cat\", \"xray\", \"cow\", \"code\", \"type\", \"flower\"]\n output: [0, 0, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]\n*/\nvoid firstLetterCounts(std::vector const& s, std::array &bins) {", + "outputs": [ + "\n for (int i = 0; i < s.size(); i += 1) {\n const char c = s[i][0];\n const int index = c - 'a';\n bins[index] += 1;\n }\n}\n\n#if defined(USE_CUDA)\n// fix the issue where atomicAdd is not defined for size_t\nstatic_assert(sizeof(size_t) == sizeof(unsigned long long), \"size_t is not 64 bits\");\n\n__device__ __forceinline__ void atomicAdd(size_t* address, size_t val) {\n atomicAdd(reinterpret_cast(address), val);\n}\n#endif", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "histogram", + "language": "cpp", + "name": "21_histogram_bin_0-100", + "parallelism_model": "serial", + "prompt": "/* Vector x contains values between 0 and 100, inclusive. Count the number of\n values in [0,10), [10, 20), [20, 30), ... 
and store the counts in `bins`.\n Example:\n\n input: [7, 32, 95, 12, 39, 32, 11, 71, 70, 66]\n output: [1, 2, 0, 3, 0, 0, 1, 2, 0, 1]\n*/\nvoid binsBy10Count(std::vector const& x, std::array &bins) {", + "outputs": [ + "\n for (size_t i = 0; i < x.size(); i += 1) {\n size_t bin = static_cast(x[i] / 10);\n bin = std::min(bin, bins.size() - 1);\n bins[bin] += 1;\n }\n}\n\n\n#if defined(USE_CUDA)\n// fix the issue where atomicAdd is not defined for size_t\nstatic_assert(sizeof(size_t) == sizeof(unsigned long long), \"size_t is not 64 bits\");\n\n__device__ __forceinline__ void atomicAdd(size_t* address, size_t val) {\n atomicAdd(reinterpret_cast(address), val);\n}\n#endif", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "39_search_xor_contains", + "parallelism_model": "serial", + "prompt": "/* Return true if `val` is only in one of vectors x or y.\n Return false if it is in both or neither.\n Examples:\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=7\n output: true\n\n input: x=[1,8,4,3,2], y=[3,4,4,1,1,7], val=1\n output: false\n*/\nbool xorContains(std::vector const& x, std::vector const& y, int val) {", + "outputs": [ + "\n const bool foundInX = std::find(x.begin(), x.end(), val) != x.end();\n const bool foundInY = std::find(y.begin(), y.end(), val) != y.end();\n\n return foundInX ^ foundInY;\n}\n", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "37_search_find_the_closest_number_to_pi", + "parallelism_model": "serial", + "prompt": "/* Return the index of the value in the vector x that is closest to the math constant PI.\n Use M_PI for the value of PI.\n Example:\n\n input: [9.18, 3.05, 7.24, 11.3, -166.49, 2.1]\n output: 1\n*/\nsize_t findClosestToPi(std::vector const& x) {", + "outputs": [ + "\n size_t index = 0;\n double min = std::abs(x[0] - M_PI);\n for (size_t i = 1; i < x.size(); ++i) {\n double diff = std::abs(x[i] - M_PI);\n 
if (diff < min) {\n min = diff;\n index = i;\n }\n }\n return index;\n}", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "38_search_find_the_first_even_number", + "parallelism_model": "serial", + "prompt": "/* Return the index of the first even number in the vector x.\n Examples:\n\n input: [7, 3, 9, 5, 5, 7, 2, 9, 12, 11]\n output: 6\n\n input: [3, 8, 9, 9, 3, 4, 8, 6]\n output: 1\n*/\nsize_t findFirstEven(std::vector const& x) {", + "outputs": [ + "\n for (size_t i = 0; i < x.size(); i += 1) {\n if (x[i] % 2 == 0) {\n return i;\n }\n }\n return x.size();\n}", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "35_search_search_for_last_struct_by_key", + "parallelism_model": "serial", + "prompt": "struct Book {\n std::string title;\n int pages;\n};\n\n/* Return the index of the last Book item in the vector books where Book.pages is less than 100.\n\t Example:\n\n input: [{title=\"Green Eggs and Ham\", pages=72}, {title=\"gulliver's travels\", pages=362}, {title=\"Stories of Your Life\", pages=54}, {title=\"Hamilton\", pages=818}]\n output: 2\n*/\nsize_t findLastShortBook(std::vector const& books) {", + "outputs": [ + "\n for (int i = books.size() - 1; i >= 0; i--) {\n if (books[i].pages < 100) {\n return i;\n }\n }\n return books.size();\n}", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "search", + "language": "cpp", + "name": "36_search_check_if_array_contains_value", + "parallelism_model": "serial", + "prompt": "/* Return true if the vector x contains the value `target`. 
Return false otherwise.\n Examples:\n\n input: x=[1, 8, 2, 6, 4, 6], target=3\n output: false\n \n input: x=[1, 8, 2, 6, 4, 6], target=8\n output: true\n*/\nbool contains(std::vector const& x, int target) {", + "outputs": [ + "\n return std::find(x.begin(), x.end(), target) != x.end();\n}", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "05_fft_inverse_fft", + "parallelism_model": "serial", + "prompt": "/* forward declare fft. computes fourier transform in-place */\nvoid fft(std::vector> &x);\n\n/* Compute the inverse fourier transform of x in-place.\n Example:\n \n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{0.5,0}, {0.125,0.301777}, {0,-0}, {0.125,0.0517767}, {0,-0}, {0.125,-0.0517767}, {0,-0}, {0.125,-0.301777}]\n*/\nvoid ifft(std::vector> &x) {", + "outputs": [ + "\n // conjugate the complex numbers\n std::transform(x.begin(), x.end(), x.begin(), [](auto const& val) { return std::conj(val); });\n\n // forward fft\n fft( x );\n\n // conjugate the complex numbers again\n std::transform(x.begin(), x.end(), x.begin(), [](auto const& val) { return std::conj(val); });\n\n // scale the numbers\n std::transform(x.begin(), x.end(), x.begin(), [&](std::complex c) { return c / static_cast(x.size()); });\n}\n\n#if defined(USE_CUDA)\n// a lot of model outputs assume this is defined for some reason, so just define it\n__device__ DOUBLE_COMPLEX_T cexp(DOUBLE_COMPLEX_T arg) {\n DOUBLE_COMPLEX_T res;\n float s, c;\n float e = expf(arg.x);\n sincosf(arg.y, &s, &c);\n res.x = c * e;\n res.y = s * e;\n return res;\n}\n\n__device__ DOUBLE_COMPLEX_T cuCexp(DOUBLE_COMPLEX_T arg) {\n return cexp(arg);\n}\n\n__device__ DOUBLE_COMPLEX_T hipCexp(DOUBLE_COMPLEX_T arg) {\n return cexp(arg);\n}\n#endif", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "08_fft_split_fft", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of 
x. Store real part of results in r and imaginary in i.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: r: [4, 1, 0, 1, 0, 1, 0, 1] i: [0, -2.41421, 0, -0.414214, 0, 0.414214, 0, 2.41421]\n*/\nvoid fft(std::vector> const& x, std::vector &r, std::vector &i) {", + "outputs": [ + "\n\tstd::vector> x_copy = x;\n\t// DFT\n\tunsigned int N = x_copy.size(), k = N, n;\n\tdouble thetaT = 3.14159265358979323846264338328L / N;\n\tstd::complex phiT = std::complex(std::cos(thetaT), -std::sin(thetaT)), T;\n\twhile (k > 1) {\n\t\tn = k;\n\t\tk >>= 1;\n\t\tphiT = phiT * phiT;\n\t\tT = 1.0L;\n\t\tfor (unsigned int l = 0; l < k; l++)\n\t\t{\n\t\t\tfor (unsigned int a = l; a < N; a += n)\n\t\t\t{\n\t\t\t\tunsigned int b = a + k;\n\t\t\t\tstd::complex t = x_copy[a] - x_copy[b];\n\t\t\t\tx_copy[a] += x_copy[b];\n\t\t\t\tx_copy[b] = t * T;\n\t\t\t}\n\t\t\tT *= phiT;\n\t\t}\n\t}\n\t// Decimate\n\tunsigned int m = (unsigned int)std::log2(N);\n\tfor (unsigned int a = 0; a < N; a++)\n\t{\n\t\tunsigned int b = a;\n\t\t// Reverse bits\n\t\tb = (((b & 0xaaaaaaaa) >> 1) | ((b & 0x55555555) << 1));\n\t\tb = (((b & 0xcccccccc) >> 2) | ((b & 0x33333333) << 2));\n\t\tb = (((b & 0xf0f0f0f0) >> 4) | ((b & 0x0f0f0f0f) << 4));\n\t\tb = (((b & 0xff00ff00) >> 8) | ((b & 0x00ff00ff) << 8));\n\t\tb = ((b >> 16) | (b << 16)) >> (32 - m);\n\t\tif (b > a)\n\t\t{\n\t\t\tstd::complex t = x_copy[a];\n\t\t\tx_copy[a] = x_copy[b];\n\t\t\tx_copy[b] = t;\n\t\t}\n\t}\n\n // split into real and imaginary parts\n for (size_t j = 0; j < x_copy.size(); j += 1) {\n r[j] = x_copy[j].real();\n i[j] = x_copy[j].imag();\n }\n}\n\nvoid fftCooleyTookey(std::vector> &x) {\n const size_t N = x.size();\n if (N <= 1) return;\n\n // divide\n std::vector> even = std::vector>(N/2);\n\tstd::vector> odd = std::vector>(N/2);\n\n\tfor (size_t j = 0; j < N/2; ++j) {\n\t\teven[j] = x[j*2];\n\t\todd[j] = x[j*2+1];\n\t}\n\n // conquer\n fftCooleyTookey(even);\n fftCooleyTookey(odd);\n\n // combine\n for (size_t k = 0; 
k < N/2; ++k) {\n std::complex t = std::polar(1.0, -2 * M_PI * k / N) * odd[k];\n x[k ] = even[k] + t;\n x[k+N/2] = even[k] - t;\n }\n}\n\n#if defined(USE_CUDA)\n// a lot of model outputs assume this is defined for some reason, so just define it\n__device__ DOUBLE_COMPLEX_T cexp(DOUBLE_COMPLEX_T arg) {\n DOUBLE_COMPLEX_T res;\n float s, c;\n float e = expf(arg.x);\n sincosf(arg.y, &s, &c);\n res.x = c * e;\n res.y = s * e;\n return res;\n}\n\n__device__ DOUBLE_COMPLEX_T cuCexp(DOUBLE_COMPLEX_T arg) {\n return cexp(arg);\n}\n\n__device__ DOUBLE_COMPLEX_T hipCexp(DOUBLE_COMPLEX_T arg) {\n return cexp(arg);\n}\n#endif", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "07_fft_fft_conjugate", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x in-place. Return the imaginary conjugate of each value.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.41421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fftConjugate(std::vector> &x) {", + "outputs": [ + "\n\t// DFT\n\tunsigned int N = x.size(), k = N, n;\n\tdouble thetaT = 3.14159265358979323846264338328L / N;\n\tstd::complex phiT = std::complex(std::cos(thetaT), -std::sin(thetaT)), T;\n\twhile (k > 1) {\n\t\tn = k;\n\t\tk >>= 1;\n\t\tphiT = phiT * phiT;\n\t\tT = 1.0L;\n\t\tfor (unsigned int l = 0; l < k; l++) {\n\t\t\tfor (unsigned int a = l; a < N; a += n) {\n\t\t\t\tunsigned int b = a + k;\n\t\t\t\tstd::complex t = x[a] - x[b];\n\t\t\t\tx[a] += x[b];\n\t\t\t\tx[b] = t * T;\n\t\t\t}\n\t\t\tT *= phiT;\n\t\t}\n\t}\n\t// Decimate\n\tunsigned int m = (unsigned int)std::log2(N);\n\tfor (unsigned int a = 0; a < N; a++)\n\t{\n\t\tunsigned int b = a;\n\t\t// Reverse bits\n\t\tb = (((b & 0xaaaaaaaa) >> 1) | ((b & 0x55555555) << 1));\n\t\tb = (((b & 0xcccccccc) >> 2) | ((b & 0x33333333) << 2));\n\t\tb = (((b & 0xf0f0f0f0) >> 4) | ((b & 0x0f0f0f0f) << 4));\n\t\tb = (((b & 
0xff00ff00) >> 8) | ((b & 0x00ff00ff) << 8));\n\t\tb = ((b >> 16) | (b << 16)) >> (32 - m);\n\t\tif (b > a)\n\t\t{\n\t\t\tstd::complex t = x[a];\n\t\t\tx[a] = x[b];\n\t\t\tx[b] = t;\n\t\t}\n\t}\n\n\t// conjugate\n\tfor (size_t i = 0; i < x.size(); i += 1) {\n\t\tx[i] = std::conj(x[i]);\n\t}\n}\n\nvoid fftCooleyTookey(std::vector>& x) {\n const size_t N = x.size();\n if (N <= 1) return;\n\n // divide\n std::vector> even = std::vector>(N/2);\n\tstd::vector> odd = std::vector>(N/2);\n\n\tfor (size_t i = 0; i < N/2; ++i) {\n\t\teven[i] = x[i*2];\n\t\todd[i] = x[i*2+1];\n\t}\n\n // conquer\n fftCooleyTookey(even);\n fftCooleyTookey(odd);\n\n // combine\n for (size_t k = 0; k < N/2; ++k) {\n std::complex t = std::polar(1.0, -2 * M_PI * k / N) * odd[k];\n x[k ] = even[k] + t;\n x[k+N/2] = even[k] - t;\n }\n\n\t// conjugate\n\tfor (size_t i = 0; i < x.size(); i += 1) {\n\t\tx[i] = std::conj(x[i]);\n\t}\n}\n\n#if defined(USE_CUDA)\n// a lot of model outputs assume this is defined for some reason, so just define it\n__device__ DOUBLE_COMPLEX_T cexp(DOUBLE_COMPLEX_T arg) {\n DOUBLE_COMPLEX_T res;\n float s, c;\n float e = expf(arg.x);\n sincosf(arg.y, &s, &c);\n res.x = c * e;\n res.y = s * e;\n return res;\n}\n\n__device__ DOUBLE_COMPLEX_T cuCexp(DOUBLE_COMPLEX_T arg) {\n return cexp(arg);\n}\n\n__device__ DOUBLE_COMPLEX_T hipCexp(DOUBLE_COMPLEX_T arg) {\n return cexp(arg);\n}\n#endif", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "09_fft_fft_out_of_place", + "parallelism_model": "serial", + "prompt": "/* Compute the fourier transform of x. 
Store the result in output.\n Example:\n\n input: [1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]\n output: [{4,0}, {1,-2.42421}, {0,0}, {1,-0.414214}, {0,0}, {1,0.414214}, {0,0}, {1,2.41421}]\n*/\nvoid fft(std::vector> const& x, std::vector> &output) {", + "outputs": [ + "\n output = x;\n\n // DFT\n\tunsigned int N = output.size(), k = N, n;\n\tdouble thetaT = 3.14159265358979323846264338328L / N;\n\tstd::complex phiT = std::complex(std::cos(thetaT), -std::sin(thetaT)), T;\n\twhile (k > 1) {\n\t\tn = k;\n\t\tk >>= 1;\n\t\tphiT = phiT * phiT;\n\t\tT = 1.0L;\n\t\tfor (unsigned int l = 0; l < k; l++)\n\t\t{\n\t\t\tfor (unsigned int a = l; a < N; a += n)\n\t\t\t{\n\t\t\t\tunsigned int b = a + k;\n\t\t\t\tstd::complex t = output[a] - output[b];\n\t\t\t\toutput[a] += output[b];\n\t\t\t\toutput[b] = t * T;\n\t\t\t}\n\t\t\tT *= phiT;\n\t\t}\n\t}\n\t// Decimate\n\tunsigned int m = (unsigned int)std::log2(N);\n\tfor (unsigned int a = 0; a < N; a++) {\n\t\tunsigned int b = a;\n\t\t// Reverse bits\n\t\tb = (((b & 0xaaaaaaaa) >> 1) | ((b & 0x55555555) << 1));\n\t\tb = (((b & 0xcccccccc) >> 2) | ((b & 0x33333333) << 2));\n\t\tb = (((b & 0xf0f0f0f0) >> 4) | ((b & 0x0f0f0f0f) << 4));\n\t\tb = (((b & 0xff00ff00) >> 8) | ((b & 0x00ff00ff) << 8));\n\t\tb = ((b >> 16) | (b << 16)) >> (32 - m);\n\t\tif (b > a) {\n\t\t\tstd::complex t = output[a];\n\t\t\toutput[a] = output[b];\n\t\t\toutput[b] = t;\n\t\t}\n\t}\n}\n\nvoid fftCooleyTookey(std::vector> &x) {\n const size_t N = x.size();\n if (N <= 1) return;\n\n // divide\n std::vector> even = std::vector>(N/2);\n\tstd::vector> odd = std::vector>(N/2);\n\n\tfor (size_t j = 0; j < N/2; ++j) {\n\t\teven[j] = x[j*2];\n\t\todd[j] = x[j*2+1];\n\t}\n\n // conquer\n fftCooleyTookey(even);\n fftCooleyTookey(odd);\n\n // combine\n for (size_t k = 0; k < N/2; ++k) {\n std::complex t = std::polar(1.0, -2 * M_PI * k / N) * odd[k];\n x[k ] = even[k] + t;\n x[k+N/2] = even[k] - t;\n }\n}\n\n#if defined(USE_CUDA)\n// a lot of model outputs assume this is 
defined for some reason, so just define it\n__device__ DOUBLE_COMPLEX_T cexp(DOUBLE_COMPLEX_T arg) {\n DOUBLE_COMPLEX_T res;\n float s, c;\n float e = expf(arg.x);\n sincosf(arg.y, &s, &c);\n res.x = c * e;\n res.y = s * e;\n return res;\n}\n\n__device__ DOUBLE_COMPLEX_T cuCexp(DOUBLE_COMPLEX_T arg) {\n return cexp(arg);\n}\n\n__device__ DOUBLE_COMPLEX_T hipCexp(DOUBLE_COMPLEX_T arg) {\n return cexp(arg);\n}\n#endif", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "fft", + "language": "cpp", + "name": "06_fft_dft", + "parallelism_model": "serial", + "prompt": "/* Compute the discrete fourier transform of x. Store the result in output.\n Example:\n\n input: [1, 4, 9, 16]\n output: [30+0i, -8-12i, -10-0i, -8+12i]\n*/\nvoid dft(std::vector const& x, std::vector> &output) {", + "outputs": [ + "\n int N = x.size();\n output.resize(N, std::complex(0, 0)); // Resize the output vector and initialize with 0\n\n for (int k = 0; k < N; k++) { // For each output element\n std::complex sum(0, 0);\n for (int n = 0; n < N; n++) { // For each input element\n double angle = 2 * M_PI * n * k / N;\n std::complex c(std::cos(angle), -std::sin(angle)); // Euler's formula\n sum += x[n] * c;\n }\n output[k] = sum;\n }\n}\n\n\n#if defined(USE_CUDA)\n// a lot of model outputs assume this is defined for some reason, so just define it\n__device__ DOUBLE_COMPLEX_T cexp(DOUBLE_COMPLEX_T arg) {\n DOUBLE_COMPLEX_T res;\n float s, c;\n float e = expf(arg.x);\n sincosf(arg.y, &s, &c);\n res.x = c * e;\n res.y = s * e;\n return res;\n}\n\n__device__ DOUBLE_COMPLEX_T cuCexp(DOUBLE_COMPLEX_T arg) {\n return cexp(arg);\n}\n\n__device__ DOUBLE_COMPLEX_T hipCexp(DOUBLE_COMPLEX_T arg) {\n return cexp(arg);\n}\n#endif", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "33_scan_reverse_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the reverse prefix sum of the vector x into output.\n Examples:\n \n input: 
[1, 7, 4, 6, 6, 2]\n output: [2, 8, 14, 18, 25, 26]\n\n input: [3, 3, 7, 1, -2]\n output: [-2, -1, 6, 9, 12]\n*/\nvoid reversePrefixSum(std::vector const& x, std::vector &output) {", + "outputs": [ + "\n std::vector reverseX;\n for (int i = x.size() - 1; i >= 0; i--) {\n reverseX.push_back(x[i]);\n }\n std::inclusive_scan(reverseX.begin(), reverseX.end(), output.begin());\n}\n", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "30_scan_prefix_sum", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum of the vector x into output.\n Example:\n \n input: [1, 7, 4, 6, 6, 2]\n output: [1, 8, 12, 18, 24, 26]\n*/\nvoid prefixSum(std::vector const& x, std::vector &output) {", + "outputs": [ + "\n std::inclusive_scan(x.begin(), x.end(), output.begin());\n}\n", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "34_scan_largest_contiguous_subarray_sum", + "parallelism_model": "serial", + "prompt": "/* Return the largest sum of any contiguous subarray in the vector x.\n i.e. 
if x=[\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4] then [4, \u22121, 2, 1] is the contiguous\n subarray with the largest sum of 6.\n Example:\n\n input: [\u22122, 1, \u22123, 4, \u22121, 2, 1, \u22125, 4]\n output: 6\n*/\nint maximumSubarray(std::vector const& x) {", + "outputs": [ + "\n int largestSum = std::numeric_limits::lowest();\n for (int i = 0; i < x.size(); i++) {\n int currSum = 0;\n for (int j = i; j < x.size(); j++) {\n currSum += x[j];\n if (currSum > largestSum) largestSum = currSum;\n }\n }\n return largestSum;\n}\n", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "32_scan_sum_of_prefix_sum_array", + "parallelism_model": "serial", + "prompt": "/* Compute the prefix sum array of the vector x and return its sum.\n Example:\n\n input: [-7, 2, 1, 9, 4, 8]\n output: 15\n*/\ndouble sumOfPrefixSum(std::vector const& x) {", + "outputs": [ + "\n std::vector prefixSum(x.size());\n std::inclusive_scan(x.begin(), x.end(), prefixSum.begin());\n return std::accumulate(prefixSum.begin(), prefixSum.end(), 0.0);\n}\n", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "scan", + "language": "cpp", + "name": "31_scan_scan_with_min_function", + "parallelism_model": "serial", + "prompt": "/* Replace the i-th element of the vector x with the minimum value from indices 0 through i.\n Examples:\n\n input: [8, 6, -1, 7, 3, 4, 4]\n output: [8, 6, -1, -1, -1, -1, -1]\n\n input: [5, 4, 6, 4, 3, 6, 1, 1]\n output: [5, 4, 4, 4, 3, 3, 1, 1]\n*/\nvoid partialMinimums(std::vector &x) {", + "outputs": [ + "\n std::inclusive_scan(x.begin(), x.end(), x.begin(), [] (const float &x, const float &y) {\n return std::min(x, y);\n },\n std::numeric_limits::max());\n}\n", + " }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "17_graph_highest_degree", + "parallelism_model": "serial", + "prompt": "/* Compute the highest node degree in the 
undirected graph. The graph is defined in the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 3\n*/\nint maxDegree(std::vector const& A, size_t N) {", + "outputs": [ + "\n int maxDegree = 0;\n for (int i = 0; i < N; i += 1) {\n int degree = 0;\n for (int j = 0; j < N; j += 1) {\n degree += A[i * N + j];\n }\n maxDegree = std::max(maxDegree, degree);\n }\n return maxDegree;\n}", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "16_graph_largest_component", + "parallelism_model": "serial", + "prompt": "/* Return the number of vertices in the largest component of the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0]]\n output: 2\n*/\nint largestComponent(std::vector const& A, size_t N) {", + "outputs": [ + "\n std::vector visited(N, false);\n int maxCount = 0;\n for (int i = 0; i < N; i += 1) {\n if (!visited[i]) {\n int count = 0;\n dfs(A, i, N, visited, count);\n maxCount = std::max(maxCount, count);\n }\n }\n return maxCount;\n}", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "19_graph_shortest_path", + "parallelism_model": "serial", + "prompt": "/* Return the length of the shortest path from source to dest in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
A is an undirected graph.\n Example:\n\n\t input: [[0, 1, 1, 0], [1, 0, 0, 1], [1, 0, 0, 1], [0, 1, 1, 0]], source=0, dest=3\n output: 2\n*/\nint shortestPathLength(std::vector const& A, size_t N, int source, int dest) {", + "outputs": [ + "\n std::vector visited(N, false);\n std::queue> queue;\n visited[source] = true;\n queue.push({source, 0});\n\n while (!queue.empty()) {\n auto [current, pathLength] = queue.front();\n queue.pop();\n\n if (current == dest) {\n return pathLength;\n }\n\n // Check all adjacent vertices\n for (int i = 0; i < N; ++i) {\n if (A[current*N + i] && !visited[i]) {\n visited[i] = true;\n queue.push({i, pathLength + 1});\n }\n }\n }\n\n return std::numeric_limits::max();\n}", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "15_graph_edge_count", + "parallelism_model": "serial", + "prompt": "/* Count the number of edges in the directed graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. A represents a directed graph.\n Example:\n\n\t input: [[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [1, 1, 1, 0]]\n output: 6\n*/\nint edgeCount(std::vector const& A, size_t N) {", + "outputs": [ + "\n int count = 0;\n for (int i = 0; i < N; i += 1) {\n for (int j = 0; j < N; j += 1) {\n if (A[i * N + j] == 1) {\n count += 1;\n }\n }\n }\n return count;\n}", + " return 0; }", + " undefinedFunction(); }" + ] + }, + { + "problem_type": "graph", + "language": "cpp", + "name": "18_graph_count_components", + "parallelism_model": "serial", + "prompt": "/* Count the number of connected components in the undirected graph defined by the adjacency matrix A.\n A is an NxN adjacency matrix stored in row-major. 
"""Generate code samples for a set of prompts with a vLLM engine.

Reads a JSON list of prompts, skips prompts that already have a complete set
of samples in the JSONL cache (``--cache``) or in an older results file
(``--restore_from``), generates the remaining samples in a single vLLM batch,
and writes cached + restored + newly generated responses to ``--output``.
"""
# std imports
import argparse
import json
import os
import sys
import time
from tqdm import tqdm
import torch

# tpl imports
from vllm import LLM, SamplingParams

# local imports
from utils import BalancedBracketsCriteria, PromptDataset, clean_output, get_inference_config


def _same_prompt(prompt: dict, response: dict) -> bool:
    """Return True if `response` was generated for `prompt` (same name, model and prompt text)."""
    return (prompt["name"] == response["name"]
            and prompt["parallelism_model"] == response["parallelism_model"]
            and prompt["prompt"] == response["prompt"])


def _same_settings(response: dict, args: argparse.Namespace) -> bool:
    """Return True if `response` was generated with the current settings and has all its samples."""
    return (args.temperature == response["temperature"]
            and args.prompted == response["prompted"]
            and args.num_samples_per_prompt == len(response["outputs"]))


def _is_reusable(prompt: dict, response: dict, args: argparse.Namespace) -> bool:
    """Return True if `response` can stand in for generating `prompt` again."""
    return _same_prompt(prompt, response) and _same_settings(response, args)


def _read_jsonl(path: str) -> list:
    """Load one JSON object per non-blank line of `path`."""
    with open(path, 'r') as jsonl_file:
        return [json.loads(line) for line in jsonl_file if line.strip()]


""" Parse command line arguments """
parser = argparse.ArgumentParser(description='Generate code with vLLM')
parser.add_argument('--prompts', required=True, help='Path to the prompt JSON file')
parser.add_argument('--model', required=True, help='Path to the language model')
parser.add_argument('--output', required=True, help='Path to the output JSON file')
parser.add_argument('--restart', action='store_true', help='Restart generation from scratch (default: False)')
parser.add_argument('--cache', help='JSONL file to cache intermediate results in. Will be restored from if it ' +
    'already exists and --restart is not specified')
parser.add_argument('--restore_from', help='JSON file to restore old results from. Will be restored from ' +
    'if it already exists and --restart is not specified. Is different from --cache in that it is a JSON file, not a ' +
    'JSONL file, and it is only used to restore old results where the prompt is equivalent. Cached results are ' +
    'prioritized over restored results.')
parser.add_argument('--max_new_tokens', type=int, default=1024, help='Maximum number of new tokens to generate (default: 1024)')
parser.add_argument('--num_samples_per_prompt', type=int, default=50, help='Number of code samples to generate (default: 50)')
parser.add_argument('--temperature', type=float, default=0.2, help='Temperature for controlling randomness (default: 0.2)')
parser.add_argument('--top_p', type=float, default=0.95, help='Top p value for nucleus sampling (default: 0.95)')
parser.add_argument('--do_sample', action='store_true', help='Enable sampling (default: False)')
parser.add_argument('--prompted', action='store_true', help='Use prompted generation. See StarCoder paper (default: False)')
args = parser.parse_args()

""" Load prompts """
with open(args.prompts, 'r') as json_file:
    prompts = json.load(json_file)

# `prompts` is narrowed below to the prompts that still need generation. Keep
# the full list so previously generated responses can be matched back to the
# requested prompt set when the final output is assembled.
all_prompts = list(prompts)

""" Load existing responses if they exist """
# --cache is optional: os.path.exists(None) raises TypeError, so test for None first.
if not args.restart and args.cache is not None and os.path.exists(args.cache):
    cached_responses = _read_jsonl(args.cache)

    # drop every prompt that already has a full set of samples generated with the current settings
    original_len = len(prompts)
    prompts = [p for p in prompts if not any(_is_reusable(p, r, args) for r in cached_responses)]
    print(f"[cache] Skipping {original_len - len(prompts)} prompts that already have responses")

""" Load existing responses if they exist """
if not args.restart and args.restore_from and os.path.exists(args.restore_from):
    with open(args.restore_from, 'r') as json_file:
        restored_responses = json.load(json_file)

    # drop every prompt that has a full set of samples in the restored results,
    # remembering the first matching response so it can be copied into the cache
    original_len = len(prompts)
    responses_to_keep = []
    prompts_without_existing_responses = []
    for p in prompts:
        for r in restored_responses:
            if _is_reusable(p, r, args):
                responses_to_keep.append(r)
                break
        else:
            prompts_without_existing_responses.append(p)
    prompts = prompts_without_existing_responses
    print(f"[restore_from] Skipping {original_len - len(prompts)} prompts that already have responses. " +
        f"{len(prompts)} prompts left.")

    # write restored responses to cache
    if args.cache is not None:
        with open(args.cache, 'a') as jsonl_file:
            for response in responses_to_keep:
                jsonl_file.write(json.dumps(response) + "\n")
        print(f"[restore_from] Wrote {len(responses_to_keep)} restored responses to cache")

""" Initialize inference config """
inference_config = get_inference_config(args.model, prompted=args.prompted)

# one entry per requested sample; samples of the same prompt are contiguous
prompts_repeated = [p for p in prompts for _ in range(args.num_samples_per_prompt)]

""" Initialize vLLM engine """
llm = LLM(model=args.model, tensor_parallel_size=torch.cuda.device_count())

# Configure sampling parameters
sampling_params = SamplingParams(
    temperature=args.temperature if args.do_sample else 0,
    top_p=args.top_p if args.do_sample else 1.0,
    max_tokens=args.max_new_tokens,
    n=1,  # We handle multiple samples manually
)

""" Generate code """
if not args.restart and args.cache is not None and os.path.exists(args.cache):
    # Start from the cached (and just-restored) responses that belong to the
    # requested prompt set. This must be matched against `all_prompts`: the
    # narrowed `prompts` list has had exactly these prompts removed, so
    # matching against it would discard every cached response.
    responses = [r for r in _read_jsonl(args.cache)
                 if any(_is_reusable(p, r, args) for p in all_prompts)]
else:
    responses = []

cur_prompt = None
start_time = time.time()
total_tokens = 0

# Format all prompts
formatted_prompts = [inference_config.format_prompt(p["prompt"]) for p in prompts_repeated]

# Generate all outputs at once
outputs = llm.generate(formatted_prompts, sampling_params)

# fetch the tokenizer once instead of once per sample
tokenizer = llm.get_tokenizer()

# Process outputs
for idx, (prompt, output) in enumerate(zip(prompts_repeated, outputs)):
    # first sample of a prompt: start a new response record
    if idx % args.num_samples_per_prompt == 0:
        cur_prompt = prompt.copy()
        cur_prompt.update({
            "temperature": args.temperature,
            "top_p": args.top_p,
            "do_sample": args.do_sample,
            "max_new_tokens": args.max_new_tokens,
            "prompted": args.prompted
        })
        cur_prompt["outputs"] = []
        cur_prompt["raw_outputs"] = []
        prompt_str = cur_prompt["prompt"]

    # Count tokens and clean output
    # FIXME: This is to keep the same behavior as generate.py
    huggingface_style_output = output.prompt + output.outputs[0].text
    total_tokens += len(tokenizer.encode(huggingface_style_output))
    cleaned_output = inference_config.clean_output(huggingface_style_output, prompt_str)
    cur_prompt["outputs"].append(cleaned_output)
    cur_prompt["raw_outputs"].append(huggingface_style_output)

    # last sample of a prompt: the record is complete, store it
    if idx % args.num_samples_per_prompt == args.num_samples_per_prompt - 1:
        responses.append(cur_prompt)

        if not args.restart and args.cache is not None:
            with open(args.cache, 'a') as jsonl_file:
                jsonl_file.write(json.dumps(cur_prompt) + "\n")

end_time = time.time()
tokens_per_second = total_tokens / (end_time - start_time)
print(f"Generated {len(responses)} code samples in {end_time - start_time:.2f} seconds ({tokens_per_second:.2f} tokens per second)")

""" Save responses to JSON file """
with open(args.output, 'w') as output_file:
    json.dump(responses, output_file, indent=4)
class QwenConfig(InferenceConfig):
    """Inference settings for Qwen models that are used for plain code completion."""

    def __init__(self, prompted: bool = False):
        """Forward `prompted` to the InferenceConfig base class."""
        super().__init__(prompted=prompted)

    def get_dtype(self):
        """Return the torch dtype to use for the model (float16)."""
        return torch.float16

    def init_padding(self, tokenizer):
        """Configure `tokenizer` in place: pad with the EOS token, on the left."""
        eos_id = tokenizer.eos_token_id
        tokenizer.pad_token_id = eos_id  # for batching
        tokenizer.padding_side = "left"  # for decoder-only models

    def get_pad_token_id(self, tokenizer) -> int:
        """Return the id used for padding, which is the tokenizer's EOS id."""
        return tokenizer.eos_token_id

    def get_eos_token_id(self, tokenizer) -> int:
        """Return None: this config supplies no explicit EOS id, despite the int annotation."""
        return None

    def trust_remote_code(self) -> bool:
        """Return False: this config does not ask for remote code to be trusted."""
        return False

    def format_prompt(self, prompt: str) -> str:
        """Return the text to feed to the model for `prompt`.

        Without `prompted`, this is the prompt with surrounding whitespace
        stripped. With `prompted`, the unstripped prompt is preceded by a
        comment header.
        """
        if not self.prompted:
            return prompt.strip()
        return f"// filename: solutions/solution_1.cpp\n// here is the correct implementation of the coding exercise\n\n{prompt}"

    def clean_output(self, output: str, prompt: str) -> str:
        """Delegate to the module-level `clean_output` helper."""
        cleaned = clean_output(output, prompt)
        return cleaned
class ChatMLConfig(InferenceConfig):
    """Inference settings for chat models that use `<|im_start|>` / `<|im_end|>` turn markers."""

    def __init__(self, prompted: bool = False):
        """Forward `prompted` to the InferenceConfig base class."""
        super().__init__(prompted=prompted)

    def get_dtype(self):
        """Return the torch dtype to use for the model (bfloat16)."""
        return torch.bfloat16

    def init_padding(self, tokenizer):
        """Configure `tokenizer` in place: pad with the EOS token, on the left."""
        eos_id = tokenizer.eos_token_id
        tokenizer.pad_token_id = eos_id  # for batching
        tokenizer.padding_side = "left"  # for decoder-only models

    def get_pad_token_id(self, tokenizer) -> int:
        """Return the tokenizer's pad token id."""
        return tokenizer.pad_token_id

    def get_eos_token_id(self, tokenizer) -> int:
        """Return the tokenizer's EOS token id."""
        return tokenizer.eos_token_id

    def trust_remote_code(self) -> bool:
        """Return False: this config does not ask for remote code to be trusted."""
        return False

    def format_prompt(self, prompt: str) -> str:
        """Wrap `prompt` in an instruction and a system/user/assistant chat transcript.

        The function name is looked up with `get_function_name`, using the
        "cuda" mode when the prompt contains `__global__` and "serial" otherwise.
        The returned text ends with the opening of the assistant turn.
        """
        if "__global__" in prompt:
            mode = "cuda"
        else:
            mode = "serial"
        function_name = get_function_name(prompt, mode)
        instruction = f"Complete the following c++ function.\n```c++{prompt.strip()}```\nWrite only the function {function_name} and no other code. Enclose your solution in ```c++ and ```."
        return f"<|im_start|>system\nYou are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.<|im_end|>\n<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"

    def clean_output(self, output: str, prompt: str) -> str:
        """Delegate to `clean_instruct_output`, using the assistant turn opener as the response tag."""
        assistant_tag = "<|im_start|>assistant\n"
        return clean_instruct_output(output, prompt, assistant_tag)
Write a response that appropriately completes the request.\n\n### Instruction:', response_tag='### Response:', **kwargs) elif model_name.startswith('hpcgroup/rlpf'): return InstructConfig(instruction_tag='### Instruction', response_tag='### Response', **kwargs) + elif model_name.startswith('Qwen/Qwen2.5') and 'Instruct' in model_name: + return ChatMLConfig(**kwargs) + elif model_name.startswith('Qwen/Qwen3'): + return ChatMLConfig(**kwargs) + elif model_name.startswith('Qwen/Qwen2.5'): + return QwenConfig(**kwargs) else: raise ValueError(f"Unknown model name: {model_name}")