diff --git a/pipeline/.gitignore b/pipeline/.gitignore
new file mode 100644
index 0000000..1521c8b
--- /dev/null
+++ b/pipeline/.gitignore
@@ -0,0 +1 @@
+dist
diff --git a/pipeline/CMakeLists.txt b/pipeline/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/pipeline/CMakeLists.txt
@@ -0,0 +1,112 @@
+cmake_minimum_required(VERSION 3.20)
+project(biergarten-pipeline VERSION 0.1.0 LANGUAGES CXX)
+
+# CMP0167 (find Boost through its own BoostConfig.cmake rather than the
+# removed FindBoost module) only exists in CMake >= 3.30. Setting an unknown
+# policy is an error, so guard it for the older versions that
+# cmake_minimum_required(VERSION 3.20) still admits.
+if(POLICY CMP0167)
+  cmake_policy(SET CMP0167 NEW)
+endif()
+
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+find_package(CURL REQUIRED)
+
+include(FetchContent)
+
+FetchContent_Declare(
+  nlohmann_json
+  GIT_REPOSITORY https://github.com/nlohmann/json.git
+  GIT_TAG v3.11.3
+)
+FetchContent_MakeAvailable(nlohmann_json)
+
+file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS
+  src/*.cpp
+  src/*.h
+)
+
+add_executable(biergarten-pipeline ${SOURCES})
+
+# Project headers are added under includes/ (includes/header.hpp).
+target_include_directories(biergarten-pipeline
+  PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/includes
+)
+
+# Boost.Test is only used by the test binary below, so it is neither required
+# nor linked for the pipeline executable itself.
+target_link_libraries(biergarten-pipeline
+  PRIVATE
+    CURL::libcurl
+    nlohmann_json::nlohmann_json
+)
+
+target_compile_options(biergarten-pipeline PRIVATE
+  $<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:
+    -Wall
+    -Wextra
+    -Wpedantic
+    -Wshadow
+    -Wconversion
+    -Wsign-conversion
+  >
+  $<$<CXX_COMPILER_ID:MSVC>:
+    /W4
+    /WX
+  >
+)
+
+add_custom_command(TARGET biergarten-pipeline POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -E make_directory
+    ${CMAKE_CURRENT_SOURCE_DIR}/output
+  COMMENT "Creating output/ directory for seed SQL files"
+)
+
+find_program(VALGRIND valgrind)
+if(VALGRIND)
+  add_custom_target(memcheck
+    COMMAND ${VALGRIND}
+      --leak-check=full
+      --error-exitcode=1
+      $<TARGET_FILE:biergarten-pipeline> --help
+    DEPENDS biergarten-pipeline
+    COMMENT "Running Valgrind memcheck"
+  )
+endif()
+
+include(CTest)
+
+if(BUILD_TESTING)
+  find_package(Boost REQUIRED COMPONENTS unit_test_framework)
+
+  file(GLOB_RECURSE TEST_SOURCES CONFIGURE_DEPENDS
+    tests/*.cpp
+    tests/*.cc
+    tests/*.cxx
+  )
+
+  if(TEST_SOURCES)
+    add_executable(biergarten-pipeline-tests ${TEST_SOURCES})
+
+    target_include_directories(biergarten-pipeline-tests
+      PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/includes
+    )
+
+    target_link_libraries(biergarten-pipeline-tests
+      PRIVATE
+        Boost::unit_test_framework
+        CURL::libcurl
+        nlohmann_json::nlohmann_json
+    )
+
+    add_test(
+      NAME biergarten-pipeline-tests
+      COMMAND biergarten-pipeline-tests
+    )
+  endif()
+endif()
diff --git a/pipeline/includes/header.hpp b/pipeline/includes/header.hpp
new file mode 100644
--- /dev/null
+++ b/pipeline/includes/header.hpp
@@ -0,0 +1,10 @@
+#pragma once
+
+// Returns the sum of a and b.
+//
+// Declared inline because the definition lives in a header: a non-inline
+// definition included from more than one translation unit is a
+// multiple-definition (ODR) error at link time.
+inline int add(int a, int b) {
+  return a + b;
+}
diff --git a/pipeline/raw-data/fetch-breweries.sh b/pipeline/raw-data/fetch-breweries.sh
new file mode 100755
--- /dev/null
+++ b/pipeline/raw-data/fetch-breweries.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Fetch breweries data from OpenBreweryDB API and save to JSON files.
+# Each page is saved to misc/raw-data/page-N.json; all pages are then merged
+# into misc/raw-data/breweries-complete.json. Requires curl and jq.
+
+# Abort on a failed command, an unset variable, or a failed pipeline stage so
+# a network error cannot silently leave empty page files behind.
+set -euo pipefail
+
+OUTPUT_DIR="misc/raw-data"
+OUTPUT_FILE="$OUTPUT_DIR/breweries-complete.json"
+API_BASE="https://api.openbrewerydb.org/v1/breweries"
+
+mkdir -p "$OUTPUT_DIR"
+
+echo "Fetching breweries from OpenBreweryDB API..."
+# Start from a valid empty array so the file is well-formed even if a later
+# step aborts the script.
+echo "[]" > "$OUTPUT_FILE"
+
+total_count=0
+page_files=()
+
+for page in {1..30}; do
+  echo "Fetching page $page..."
+
+  page_file="$OUTPUT_DIR/page-$page.json"
+
+  # -f: fail on HTTP errors instead of saving the error body; -S: still print
+  # the error despite -s.
+  curl -fsS "$API_BASE?per_page=200&page=$page" | \
+    jq '.' > "$page_file"
+
+  count=$(jq 'length' "$page_file")
+  total_count=$((total_count + count))
+  echo " Got $count breweries (total: $total_count)"
+  page_files+=("$page_file")
+done
+
+# Concatenate the per-page arrays into a single array.
+jq -s 'add' "${page_files[@]}" > "$OUTPUT_FILE"
+
+echo "Done fetching. Total breweries fetched: $total_count"
diff --git a/pipeline/src/main.cpp b/pipeline/src/main.cpp
new file mode 100644
--- /dev/null
+++ b/pipeline/src/main.cpp
@@ -0,0 +1,150 @@
+#include <curl/curl.h>
+#include <nlohmann/json.hpp>
+
+#include <cstddef>
+#include <exception>
+#include <filesystem>
+#include <fstream>
+#include <future>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace fs = std::filesystem;
+
+// Upper bound on the number of result pages requested from the API.
+constexpr int kMaxPages = 30;
+
+// Owns libcurl's global state: curl_global_init() on construction,
+// curl_global_cleanup() on destruction. Throws std::runtime_error when
+// initialization fails. Non-copyable so cleanup runs exactly once.
+struct GlobalCurl {
+  GlobalCurl() {
+    if (curl_global_init(CURL_GLOBAL_DEFAULT) != 0)
+      throw std::runtime_error("Failed to initialize libcurl");
+  }
+  ~GlobalCurl() { curl_global_cleanup(); }
+
+  GlobalCurl(const GlobalCurl &) = delete;
+  GlobalCurl &operator=(const GlobalCurl &) = delete;
+};
+
+// Deleter that lets std::unique_ptr release a curl easy handle on every exit
+// path of fetchPage().
+struct CurlEasyDeleter {
+  void operator()(CURL *handle) const noexcept { curl_easy_cleanup(handle); }
+};
+using CurlHandle = std::unique_ptr<CURL, CurlEasyDeleter>;
+
+// Outcome of fetching a single page. `error` is empty on success.
+struct PageResult {
+  int page;
+  int count;
+  std::string error;
+};
+
+// CURL writes data in chunks — this callback appends each chunk to a string.
+// The last parameter is void* to match the callback type libcurl invokes; it
+// is the std::string registered through CURLOPT_WRITEDATA.
+static size_t writeCallback(char *ptr, size_t size, size_t nmemb, void *userdata) {
+  auto *out = static_cast<std::string *>(userdata);
+  out->append(ptr, size * nmemb);
+  return size * nmemb;
+}
+
+// Downloads one page of breweries and, when it holds at least one entry,
+// writes it pretty-printed to output/page-<page>.json. Transfer, parse and
+// file-write failures are reported through PageResult::error.
+static PageResult fetchPage(int page) {
+  PageResult result{page, 0, ""};
+
+  CurlHandle curl{curl_easy_init()};
+  if (!curl) {
+    result.error = "Failed to initialize CURL";
+    return result;
+  }
+
+  std::string response;
+  const std::string api_url =
+      "https://api.openbrewerydb.org/v1/breweries?per_page=200&page=" + std::to_string(page);
+
+  curl_easy_setopt(curl.get(), CURLOPT_URL, api_url.c_str());
+  curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, writeCallback);
+  curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response);
+  // Report HTTP status >= 400 as a transfer error instead of handing the
+  // error body to the JSON parser.
+  curl_easy_setopt(curl.get(), CURLOPT_FAILONERROR, 1L);
+
+  const CURLcode res = curl_easy_perform(curl.get());
+  if (res != CURLE_OK) {
+    result.error = curl_easy_strerror(res);
+    return result;
+  }
+
+  try {
+    const nlohmann::json breweries = nlohmann::json::parse(response);
+    if (!breweries.is_array()) {
+      result.error = "Unexpected response: expected a JSON array";
+      return result;
+    }
+    result.count = static_cast<int>(breweries.size());
+
+    if (result.count > 0) {
+      const std::string out_path = "output/page-" + std::to_string(page) + ".json";
+      std::ofstream out_file(out_path);
+      out_file << breweries.dump(2) << std::flush;
+      if (!out_file) {
+        result.error = "Failed to write " + out_path;
+      }
+    }
+  } catch (const std::exception &ex) {
+    result.error = ex.what();
+  }
+
+  return result;
+}
+
+// Fetches pages 1..kMaxPages concurrently, then reports them in page order
+// with a running total. Stops reporting at the first empty page. Returns 0 on
+// success and 1 when a page fails or setup throws.
+int main() {
+  try {
+    fs::create_directories("output");
+
+    // Declared before `jobs` so the futures (whose destructors wait for their
+    // tasks) are destroyed before curl_global_cleanup() runs, including on the
+    // early-return paths below.
+    GlobalCurl curl_guard;
+
+    std::vector<std::future<PageResult>> jobs;
+    jobs.reserve(static_cast<std::size_t>(kMaxPages));
+
+    for (int page = 1; page <= kMaxPages; ++page) {
+      jobs.emplace_back(std::async(std::launch::async, fetchPage, page));
+    }
+
+    int total_count = 0;
+    for (auto &job : jobs) {
+      const PageResult r = job.get();
+
+      std::cout << "Fetching page " << r.page << "..." << std::endl;
+
+      if (!r.error.empty()) {
+        std::cerr << "Error on page " << r.page << ": " << r.error << std::endl;
+        return 1;
+      }
+
+      total_count += r.count;
+      std::cout << " Got " << r.count << " breweries (total: " << total_count << ")" << std::endl;
+
+      if (r.count == 0) break;
+    }
+  } catch (const std::exception &ex) {
+    std::cerr << "Fatal error: " << ex.what() << std::endl;
+    return 1;
+  }
+
+  return 0;
+}