Add pipeline CMake, source, and fetch script

This commit is contained in:
Aaron Po
2026-03-24 02:10:21 -04:00
parent 581863d69b
commit ad1adfeb62
5 changed files with 247 additions and 0 deletions

1
pipeline/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
dist

105
pipeline/CMakeLists.txt Normal file
View File

@@ -0,0 +1,105 @@
# Build script for the biergarten data pipeline executable and its optional
# Boost.Test-based unit tests.
cmake_minimum_required(VERSION 3.20)
project(biergarten-pipeline VERSION 0.1.0 LANGUAGES CXX)

# CMP0167 (prefer the upstream BoostConfig.cmake over the FindBoost module) was
# introduced in CMake 3.30. Setting a policy the running CMake does not know is
# a hard error, so guard it to keep the declared 3.20 minimum usable.
if(POLICY CMP0167)
  cmake_policy(SET CMP0167 NEW)
endif()

set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

find_package(CURL REQUIRED)

include(FetchContent)
FetchContent_Declare(
  nlohmann_json
  GIT_REPOSITORY https://github.com/nlohmann/json.git
  GIT_TAG v3.11.3
)
FetchContent_MakeAvailable(nlohmann_json)

# CONFIGURE_DEPENDS re-runs the glob at build time so new files are picked up.
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS
  src/*.cpp
  src/*.h
)

add_executable(biergarten-pipeline ${SOURCES})
target_include_directories(biergarten-pipeline
  PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/include
)
# The production binary does not link the Boost unit test framework; Boost is
# only located and linked for the test executable below.
target_link_libraries(biergarten-pipeline
  PRIVATE
    CURL::libcurl
    nlohmann_json::nlohmann_json
)
target_compile_options(biergarten-pipeline PRIVATE
  $<$<CXX_COMPILER_ID:GNU,Clang>:
    -Wall
    -Wextra
    -Wpedantic
    -Wshadow
    -Wconversion
    -Wsign-conversion
  >
  $<$<CXX_COMPILER_ID:MSVC>:
    /W4
    /WX
  >
)

add_custom_command(TARGET biergarten-pipeline POST_BUILD
  COMMAND ${CMAKE_COMMAND} -E make_directory
    ${CMAKE_CURRENT_SOURCE_DIR}/output
  COMMENT "Creating output/ directory for seed SQL files"
)

# Optional `memcheck` target, only defined when valgrind is installed.
# NOTE(review): main.cpp as committed ignores its command-line arguments, so
# `--help` still performs the full network fetch under Valgrind - confirm intent.
find_program(VALGRIND valgrind)
if(VALGRIND)
  add_custom_target(memcheck
    COMMAND ${VALGRIND}
      --leak-check=full
      --error-exitcode=1
      $<TARGET_FILE:biergarten-pipeline> --help
    DEPENDS biergarten-pipeline
    COMMENT "Running Valgrind memcheck"
  )
endif()

# include(CTest) defines the BUILD_TESTING option (ON by default).
include(CTest)
if(BUILD_TESTING)
  find_package(Boost REQUIRED COMPONENTS unit_test_framework)
  file(GLOB_RECURSE TEST_SOURCES CONFIGURE_DEPENDS
    tests/*.cpp
    tests/*.cc
    tests/*.cxx
  )
  # Only create the test target once at least one test source exists.
  if(TEST_SOURCES)
    add_executable(biergarten-pipeline-tests ${TEST_SOURCES})
    target_include_directories(biergarten-pipeline-tests
      PRIVATE
        ${CMAKE_CURRENT_SOURCE_DIR}/include
    )
    target_link_libraries(biergarten-pipeline-tests
      PRIVATE
        Boost::unit_test_framework
        CURL::libcurl
        nlohmann_json::nlohmann_json
    )
    add_test(
      NAME biergarten-pipeline-tests
      COMMAND biergarten-pipeline-tests
    )
  endif()
endif()

View File

@@ -0,0 +1,5 @@
#pragma once
/// @brief Returns the sum of two integers.
/// @param a First addend.
/// @param b Second addend.
/// @return a + b.
///
/// Declared constexpr (and therefore implicitly inline) because the definition
/// lives in a header: a plain non-inline definition would be emitted by every
/// translation unit that includes it and fail to link with duplicate symbols.
constexpr int add(int a, int b) noexcept {
  return a + b;
}

View File

@@ -0,0 +1,27 @@
#!/bin/bash
# Fetch breweries data from the OpenBreweryDB API and save it as JSON.
#
# Each non-empty page is written to misc/raw-data/page-<N>.json, and all pages
# are then merged into misc/raw-data/breweries-complete.json.
# Requires: curl, jq.

# Abort on command failure, unset variables, and failures inside pipelines.
set -euo pipefail

OUTPUT_DIR="misc/raw-data"
OUTPUT_FILE="$OUTPUT_DIR/breweries-complete.json"
API_BASE="https://api.openbrewerydb.org/v1/breweries"
PER_PAGE=200
MAX_PAGES=30

mkdir -p "$OUTPUT_DIR"
echo "Fetching breweries from OpenBreweryDB API..."
# Start from an empty array so the combined file is valid JSON even if no
# page returns any data.
echo "[]" > "$OUTPUT_FILE"
total_count=0
page_files=()
for ((page = 1; page <= MAX_PAGES; page++)); do
  echo "Fetching page $page..."
  page_file="$OUTPUT_DIR/page-$page.json"
  # -f turns HTTP error statuses into a non-zero exit instead of saving the
  # error body; -S still prints the error message despite -s.
  curl -fsS "$API_BASE?per_page=$PER_PAGE&page=$page" | \
    jq '.' > "$page_file"
  count=$(jq 'length' "$page_file")
  total_count=$((total_count + count))
  echo " Got $count breweries (total: $total_count)"
  # An empty page means the data set is exhausted; stop instead of requesting
  # the remaining pages.
  if [ "$count" -eq 0 ]; then
    rm -f "$page_file"
    break
  fi
  page_files+=("$page_file")
done
# Concatenate every fetched page array into the single combined file.
if [ "${#page_files[@]}" -gt 0 ]; then
  jq -s 'add' "${page_files[@]}" > "$OUTPUT_FILE"
fi
echo "Done fetching. Total breweries fetched: $total_count"

109
pipeline/src/main.cpp Normal file
View File

@@ -0,0 +1,109 @@
#include <exception>
#include <filesystem>
#include <fstream>
#include <future>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include <curl/curl.h>
#include <nlohmann/json.hpp>
#
namespace fs = std::filesystem;
// Scope guard for libcurl's global state: the constructor calls
// curl_global_init and the destructor calls curl_global_cleanup.
// Copying is disabled so the cleanup call is not duplicated by copies.
struct GlobalCurl {
  GlobalCurl() {
    const auto init_status = curl_global_init(CURL_GLOBAL_DEFAULT);
    if (init_status != 0) {
      throw std::runtime_error("Failed to initialize libcurl");
    }
  }

  GlobalCurl(const GlobalCurl &) = delete;
  GlobalCurl &operator=(const GlobalCurl &) = delete;

  ~GlobalCurl() {
    curl_global_cleanup();
  }
};
// libcurl write callback: the response body arrives in pieces, and each piece
// (size * nmemb bytes starting at ptr) is appended to the string behind `out`.
// Returns the number of bytes consumed, which here is always the full piece.
static size_t writeCallback(char *ptr, size_t size, size_t nmemb, std::string *out) {
  const size_t chunk_bytes = size * nmemb;
  out->append(ptr, chunk_bytes);
  return chunk_bytes;
}
int main(int argc, char **argv) {
int total_count = 0;
fs::create_directories("output");
GlobalCurl curl_guard;
struct PageResult {
int page;
int count;
std::string error;
};
std::vector<std::future<PageResult>> jobs;
jobs.reserve(30);
for (int page = 1; page <= 30; ++page) {
jobs.emplace_back(std::async(std::launch::async, [page]() -> PageResult {
PageResult result{page, 0, ""};
CURL *curl = curl_easy_init();
if (!curl) {
result.error = "Failed to initialize CURL";
return result;
}
std::string response;
std::string api_url =
"https://api.openbrewerydb.org/v1/breweries?per_page=200&page=" + std::to_string(page);
curl_easy_setopt(curl, CURLOPT_URL, api_url.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
CURLcode res = curl_easy_perform(curl);
if (res != CURLE_OK) {
result.error = curl_easy_strerror(res);
curl_easy_cleanup(curl);
return result;
}
try {
nlohmann::json breweries = nlohmann::json::parse(response);
result.count = static_cast<int>(breweries.size());
if (result.count > 0) {
std::string out_path = "output/page-" + std::to_string(page) + ".json";
std::ofstream out_file(out_path);
out_file << breweries.dump(2);
}
} catch (const std::exception &ex) {
result.error = ex.what();
}
curl_easy_cleanup(curl);
return result;
}));
}
for (auto &job : jobs) {
PageResult r = job.get();
std::cout << "Fetching page " << r.page << "..." << std::endl;
if (!r.error.empty()) {
std::cerr << "Error on page " << r.page << ": " << r.error << std::endl;
curl_global_cleanup();
return 1;
}
total_count += r.count;
std::cout << " Got " << r.count << " breweries (total: " << total_count << ")" << std::endl;
if (r.count == 0) break;
}
curl_global_cleanup();
return 0;
}