Add multithreaded logging infrastructure for preparation for future designs (#225)

* Update class diagrams

* Implement BoundedChannel and multithreaded logging infra

* Integrate logging channel system

* Update string concatenations to use std::format

* Add pretty print log
This commit is contained in:
2026-05-22 22:00:38 -04:00
committed by GitHub
parent 2ee7b3d2a2
commit 6a66619c70
44 changed files with 1445 additions and 439 deletions

View File

@@ -3,7 +3,10 @@
/**
* @file biergarten_data_generator.h
* @brief Core orchestration class for pipeline data generation.
* @brief Orchestration for end-to-end brewery data generation pipeline.
*
* Intent: Coordinates location loading, enrichment, and generation phases
* to produce a complete dataset. Coordinates dependencies via composition root.
*/
#include <memory>
@@ -15,25 +18,30 @@
#include "services/database/export_service.h"
#include "services/enrichment/enrichment_service.h"
#include "services/logging/logger.h"
/**
* @brief Main data generator class for the Biergarten pipeline.
*
* This class encapsulates the core logic for generating brewery data.
* It handles location loading, city enrichment, and brewery generation.
*/
class BiergartenDataGenerator {
class BiergartenPipelineOrchestrator {
public:
/**
* @brief Construct a BiergartenDataGenerator with injected dependencies.
*
* @param context_service Context provider for sampled locations.
* @param generator Brewery and user data generator.
* @param exporter Storage backend for generated brewery data.
*/
BiergartenDataGenerator(std::unique_ptr<IEnrichmentService> context_service,
std::unique_ptr<DataGenerator> generator,
std::unique_ptr<IExportService> exporter,
const ApplicationOptions& application_options);
/**
* @brief Constructs the orchestrator with injected pipeline dependencies.
*
* @param context_service Provides regional context for locations.
* @param generator Implementation (Llama or Mock) for brewery/user generation.
* @param exporter Database backend for persisting generated records.
* @param application_options CLI configuration and paths.
*/
BiergartenPipelineOrchestrator(
std::shared_ptr<ILogger> logger,
std::unique_ptr<IEnrichmentService> context_service,
std::unique_ptr<DataGenerator> generator,
std::unique_ptr<IExportService> exporter,
const ApplicationOptions& application_options);
/**
* @brief Run the data generation pipeline.
@@ -43,21 +51,31 @@ class BiergartenDataGenerator {
* 2. Resolve context for each city using the injected context service
* 3. Generate brewery data for sampled cities
*
* @note STRUCTURAL CONCURRENCY REQUIREMENT:
* When transitioned to a multithreaded design, this method MUST structurally
* enforce that all deployed worker threads are joined before returning (e.g.
* by using std::jthread or a structured concurrency primitive). This ensures
* workers do not attempt to log to a closed channel during application teardown.
*
* @return true if successful, false if not
*/
bool Run();
private:
/// @brief Logger instance for emitting pipeline messages.
std::shared_ptr<ILogger> logger_;
/// @brief Owning context provider dependency.
std::unique_ptr<IEnrichmentService> context_service_;
/// @brief Generator dependency selected in the composition root.
std::unique_ptr<DataGenerator> generator_;
/// @brief Storage backend for generated brewery records.
std::unique_ptr<IExportService> exporter_;
/// @brief Storage backend for generated brewery records.
std::unique_ptr<IExportService> exporter_;
const ApplicationOptions application_options_;
/// @brief CLI configuration: paths, model settings, generation parameters.
ApplicationOptions application_options_;
/**
* @brief Load locations from JSON and sample cities.

View File

@@ -0,0 +1,73 @@
#ifndef BIERGARTEN_PIPELINE_INCLUDES_CONCURRENCY_BOUNDED_CHANNEL_H_
#define BIERGARTEN_PIPELINE_INCLUDES_CONCURRENCY_BOUNDED_CHANNEL_H_
#include <condition_variable>
#include <cstddef>
#include <mutex>
#include <optional>
#include <queue>
/**
* @file bounded_channel.h
* @brief Thread-safe, bounded multi-producer/multi-consumer synchronous channel.
*
* Intent: Enables asynchronous inter-thread communication with backpressure.
* Models a synchronous channel where producers/consumers block on capacity limits.
*/
/**
* @class BoundedChannel
* @brief MPMC channel with fixed capacity and blocking semantics.
*
* Producers block when buffer is full; consumers block when empty.
* Close() unblocks all waiters and signals channel exhaustion.
*/
template <typename T>
class BoundedChannel {
// -------------------------------------------------------------------------
// Internal state — all access must be guarded by mutex_.
// -------------------------------------------------------------------------
std::queue<T> queue_;
std::mutex mutex_;
std::condition_variable not_full_;
std::condition_variable not_empty_;
std::size_t capacity_;
bool closed_ = false;
public:
/**
* @brief Construct a bounded channel with the given capacity.
* @param capacity Maximum number of items the channel may hold.
*/
explicit BoundedChannel(std::size_t capacity) : capacity_(capacity) {}
/**
* @brief Send an item into the channel. Blocks when the channel is full.
* @param item Move-only item to enqueue.
*/
void Send(T item);
/**
* @brief Receive an item from the channel. Blocks when the channel is
* empty.
* @return std::optional<T> containing the item, or std::nullopt when the
* channel is closed and drained.
*/
std::optional<T> Receive();
/**
* @brief Close the channel and unblock all waiting threads. Idempotent.
*/
void Close();
};
// Include the template implementation
#include "bounded_channel.tcc"
#endif // BIERGARTEN_PIPELINE_INCLUDES_CONCURRENCY_BOUNDED_CHANNEL_H_

View File

@@ -0,0 +1,57 @@
#include "bounded_channel.h"
template <typename T>
void BoundedChannel<T>::Send(T item) {
// Acquire exclusive ownership of the mutex; released automatically on scope exit.
std::unique_lock lock(mutex_);
// Block until there is space in the queue or the channel has been closed.
// The predicate guards against spurious wakeups.
not_full_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; });
// If the channel was closed while waiting, discard the item and return.
if (closed_) return;
// Move the item into the queue to avoid an unnecessary copy.
queue_.push(std::move(item));
// Wake one blocked Receive() call to signal that data is now available.
not_empty_.notify_one();
}
template <typename T>
std::optional<T> BoundedChannel<T>::Receive() {
// Acquire exclusive ownership of the mutex.
std::unique_lock lock(mutex_);
// Block until the queue is non-empty or the channel has been closed.
// The predicate guards against spurious wakeups.
not_empty_.wait(lock, [&] { return !queue_.empty() || closed_; });
// If woken due to closure and no items remain, signal exhaustion via nullopt.
if (queue_.empty()) return std::nullopt;
// Move the front item out of the queue to avoid an unnecessary copy.
T item = std::move(queue_.front());
queue_.pop();
// Wake one blocked Send() call to signal that a slot has opened.
not_full_.notify_one();
return item;
}
template <typename T>
void BoundedChannel<T>::Close() {
// Acquire exclusive ownership of the mutex to ensure visibility of the flag.
std::unique_lock lock(mutex_);
// Mark the channel as closed; subsequent Send() calls will be dropped.
closed_ = true;
// Wake all blocked Send() callers so they can observe the closed flag and exit.
not_full_.notify_all();
// Wake all blocked Receive() callers so they can drain remaining items or return nullopt.
not_empty_.notify_all();
}

View File

@@ -18,6 +18,7 @@
#include "data_generation/data_generator.h"
#include "data_generation/prompt_formatting/prompt_formatter.h"
#include "data_model/models.h"
#include "services/logging/logger.h"
struct llama_model;
struct llama_context;
@@ -37,7 +38,7 @@ class LlamaGenerator final : public DataGenerator {
* @param prompt_directory Directory service for loading named prompt files.
*/
LlamaGenerator(const ApplicationOptions& options,
const std::string& model_path,
const std::string& model_path, std::shared_ptr<ILogger> logger,
std::unique_ptr<IPromptFormatter> prompt_formatter,
std::unique_ptr<IPromptDirectory> prompt_directory);
@@ -130,6 +131,7 @@ class LlamaGenerator final : public DataGenerator {
std::mt19937 rng_;
uint32_t n_ctx_ = kDefaultContextSize;
int n_gpu_layers_ = 0;
std::shared_ptr<ILogger> logger_;
std::unique_ptr<IPromptFormatter> prompt_formatter_;
std::unique_ptr<IPromptDirectory> prompt_directory_;
};

View File

@@ -10,11 +10,14 @@
#include <boost/program_options.hpp>
#include <cstdint>
#include <filesystem>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <vector>
class ILogger;
namespace prog_opts = boost::program_options;
// ============================================================================
@@ -136,6 +139,7 @@ struct ApplicationOptions {
// Function Declarations
// ============================================================================
std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv);
std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv,
std::shared_ptr<ILogger> logger = nullptr);
#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_MODELS_H_

View File

@@ -7,16 +7,19 @@
*/
#include <filesystem>
#include <memory>
#include <vector>
#include "data_model/models.h"
#include "services/logging/logger.h"
/// @brief Loads curated world locations from a JSON file into memory.
class JsonLoader {
public:
/// @brief Parses a JSON array file and returns all location records.
static std::vector<Location> LoadLocations(
const std::filesystem::path& filepath);
const std::filesystem::path& filepath,
std::shared_ptr<ILogger> logger = nullptr);
};
#endif // BIERGARTEN_PIPELINE_INCLUDES_JSON_HANDLING_JSON_LOADER_H_

View File

@@ -0,0 +1,109 @@
#ifndef BIERGARTEN_PIPELINE_INCLUDES_JSON_HANDLING_PRETTY_PRINT_H_
#define BIERGARTEN_PIPELINE_INCLUDES_JSON_HANDLING_PRETTY_PRINT_H_
/**
* @file json_handling/pretty_print.h
* @brief Pretty-printing utilities for JSON values.
*
* Provides formatting capability for boost::json::value with indentation and
* readable output. Adapted from Boost JSON library examples.
*/
#include <boost/json.hpp>
#include <ostream>
#include <string>
/**
* @brief Pretty-prints a JSON value to an output stream with indentation.
*
* Recursively formats JSON objects and arrays with consistent 4-space
* indentation. Adapted from:
* https://raw.githubusercontent.com/boostorg/json/refs/heads/develop/example/pretty.cpp
*
* @param outstream Output stream to write formatted JSON.
* @param json_val JSON value to format.
* @param indent Optional indentation string (managed internally on first call).
*/
inline void PrettyPrint(std::ostream& outstream,
boost::json::value const& json_val,
std::string* indent = nullptr) {
std::string str;
if (indent == nullptr) {
indent = &str;
}
switch (json_val.kind()) {
case boost::json::kind::object: {
outstream << "{\n";
indent->append(4, ' ');
auto const& obj = json_val.get_object();
if (!obj.empty()) {
const auto* iter = obj.begin();
for (;;) {
outstream << *indent << boost::json::serialize(iter->key()) << " : ";
PrettyPrint(outstream, iter->value(), indent);
iter = std::next(iter);
if (iter == obj.end()) {
break;
}
outstream << ",\n";
}
}
outstream << "\n";
indent->resize(indent->size() - 4);
outstream << *indent << "}";
break;
}
case boost::json::kind::array: {
outstream << "[\n";
indent->append(4, ' ');
auto const& arr = json_val.get_array();
if (!arr.empty()) {
const auto* iter = arr.begin();
for (;;) {
outstream << *indent;
PrettyPrint(outstream, *iter, indent);
iter = std::next(iter);
if (iter == arr.end()) {
break;
}
outstream << ",\n";
}
}
outstream << "\n";
indent->resize(indent->size() - 4);
outstream << *indent << "]";
break;
}
case boost::json::kind::string: {
outstream << serialize(json_val.get_string());
break;
}
case boost::json::kind::uint64:
case boost::json::kind::int64:
case boost::json::kind::double_:
outstream << json_val;
break;
case boost::json::kind::bool_:
if (json_val.get_bool()) {
outstream << "true";
} else {
outstream << "false";
}
break;
case boost::json::kind::null:
outstream << "null";
break;
}
if (indent->empty()) {
outstream << "\n";
}
}
#endif

View File

@@ -12,13 +12,15 @@
#include <unordered_map>
#include "enrichment_service.h"
#include "services/logging/logger.h"
#include "web_client/web_client.h"
/// @brief Provides Wikipedia summary lookups backed by cached raw extracts.
class WikipediaEnrichmentService final : public IEnrichmentService {
public:
/// @brief Creates a new Wikipedia service with the provided web client.
explicit WikipediaEnrichmentService(std::unique_ptr<WebClient> client);
explicit WikipediaEnrichmentService(std::unique_ptr<WebClient> client,
std::shared_ptr<ILogger> logger);
/// @brief Returns the Wikipedia-derived context for a location.
[[nodiscard]] std::string GetLocationContext(const Location& loc) override;
@@ -26,6 +28,7 @@ class WikipediaEnrichmentService final : public IEnrichmentService {
private:
std::string FetchExtract(std::string_view query);
std::unique_ptr<WebClient> client_;
std::shared_ptr<ILogger> logger_;
/// @brief Canonical cache for raw Wikipedia query extracts.
std::unordered_map<std::string, std::string> extract_cache_;
};

View File

@@ -0,0 +1,53 @@
/**
* @file services/logging/log_dispatcher.h
* @brief Dedicated log dispatcher for asynchronous pipeline logging.
*
* The dispatcher drains LogEntry values from a bounded channel and forwards
* them to spdlog on a dedicated thread.
*/
#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_LOGGING_LOG_DISPATCHER_H_
#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_LOGGING_LOG_DISPATCHER_H_
#include <spdlog/spdlog.h>
#include "concurrency/bounded_channel.h"
#include "services/logging/log_entry.h"
/**
* @class LogDispatcher
* @brief Consumes log entries from a channel and forwards them to spdlog.
*
* Non-copyable and non-movable. Intended to run on its own dedicated thread
* and exit once the channel has been closed and drained.
*/
class LogDispatcher {
public:
/**
* @brief Construct a log dispatcher.
*
* @param channel Reference to the bounded channel used for log retrieval.
*/
explicit LogDispatcher(BoundedChannel<LogEntry>& channel);
LogDispatcher(const LogDispatcher&) = delete;
LogDispatcher& operator=(const LogDispatcher&) = delete;
LogDispatcher(LogDispatcher&&) = delete;
LogDispatcher& operator=(LogDispatcher&&) = delete;
~LogDispatcher() = default;
/**
* @brief Drain the channel and forward entries to spdlog.
*
* Intended to be called once on a dedicated thread. The loop returns after
* the channel has been closed and all queued entries have been processed.
*/
void Run();
private:
BoundedChannel<LogEntry>& channel_;
static spdlog::level::level_enum ToSpdlogLevel(LogLevel level);
};
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_LOGGING_LOG_DISPATCHER_H_

View File

@@ -0,0 +1,88 @@
/**
* @file services/logging/log_entry.h
* @brief Structured log record shared by the pipeline logging infra.
*
* LogEntry is a lightweight value type that can be passed safely between the
* logging producer and dispatcher through BoundedChannel<LogEntry>.
*/
#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_LOGGING_LOG_ENTRY_H_
#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_LOGGING_LOG_ENTRY_H_
#include <chrono>
#include <source_location>
#include <string>
#include <thread>
#include <vector>
/**
* @enum LogLevel
* @brief Severity levels supported by the logging infra.
*/
enum class LogLevel {
Debug, ///< Development/debugging information.
Info, ///< General informational messages.
Warn, ///< Warning conditions.
Error, ///< Error conditions.
};
/**
* @enum PipelinePhase
* @brief Pipeline execution phases used to tag log records.
*
* The phase tag makes it easier to correlate log output with the part of the
* pipeline that emitted it.
*/
enum class PipelinePhase {
Startup, ///< Initialization and validation.
UserGeneration, ///< User profile generation.
BreweryAndBeerGeneration, ///< Brewery and beer data generation.
CheckinGeneration, ///< Checkin (visit) record generation.
RatingGeneration, ///< Rating and review generation.
FollowGeneration, ///< Follow relationship generation.
Teardown, ///< Finalization and cleanup.
};
/**
* @struct LogDTO
* @brief User-provided subset of log fields. Used to capture call-site info transparently.
*/
struct LogDTO {
LogLevel level;
PipelinePhase phase;
std::string message;
};
/**
* @struct LogEntry
* @brief Single structured log event.
*
* All fields are value types, which keeps transfer across the bounded channel
* simple and avoids shared ownership.
*
* NOTE: timestamp, thread_id, and origin must be populated by ILogger::Log()
* before the entry is dispatched.
*/
struct LogEntry {
/// @brief Timestamp when the entry was created.
std::chrono::system_clock::time_point timestamp{};
/// @brief Source location where the log call was made.
std::source_location origin{};
/// @brief Thread responsible for emitting the log.
std::thread::id thread_id{};
/// @brief Severity level of this entry.
LogLevel level;
/// @brief Pipeline phase associated with the entry.
PipelinePhase phase;
/// @brief Log message text.
std::string message;
};
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_LOGGING_LOG_ENTRY_H_

View File

@@ -0,0 +1,53 @@
/**
* @file services/logging/log_producer.h
* @brief Channel-backed log producer for asynchronous pipeline logging.
*
* The producer captures log records from application code and forwards them to
* a bounded channel for later processing by the dispatcher.
*/
#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_LOGGING_CHANNEL_LOGGER_H_
#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_LOGGING_CHANNEL_LOGGER_H_
#include <string_view>
#include "concurrency/bounded_channel.h"
#include "services/logging/log_entry.h"
#include "services/logging/logger.h"
/**
* @class LogProducer
* @brief ILogger implementation that forwards entries to a bounded channel.
*
* Non-copyable and non-movable. The channel reference is non-owning and must
* remain valid for the lifetime of the producer.
*/
class LogProducer final : public ILogger {
public:
/**
* @brief Construct a channel-backed producer.
*
* @param channel Reference to the bounded channel used for log transfer.
*/
explicit LogProducer(BoundedChannel<LogEntry>& channel);
LogProducer(const LogProducer&) = delete;
LogProducer& operator=(const LogProducer&) = delete;
LogProducer(LogProducer&&) = delete;
LogProducer& operator=(LogProducer&&) = delete;
~LogProducer() override = default;
/**
* @brief Queue a log message for asynchronous processing.
*
* Blocks while the channel applies backpressure. This blocking behavior
* under heavy load is an accepted trade-off for simplicity.
*/
void DoLog(LogEntry log_entry) override;
private:
BoundedChannel<LogEntry>& channel_;
};
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_LOGGING_CHANNEL_LOGGER_H_

View File

@@ -0,0 +1,64 @@
/**
* @file services/logging/logger.h
* @brief Abstract logging interface used by pipeline components.
*
* The interface keeps application code independent from the concrete logging
* transport, buffering, and formatting implementation.
*/
#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_LOGGING_LOGGER_H_
#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_LOGGING_LOGGER_H_
#include <source_location>
#include <string>
#include <utility>
#include "services/logging/log_entry.h"
/**
* @class ILogger
* @brief Minimal interface for submitting structured log messages.
*
* Implementations are non-copyable and non-movable. They are typically owned
* by the composition root and injected into services that emit diagnostics.
*/
class ILogger {
public:
ILogger() = default;
ILogger(const ILogger&) = delete;
ILogger& operator=(const ILogger&) = delete;
ILogger(ILogger&&) = delete;
ILogger& operator=(ILogger&&) = delete;
virtual ~ILogger() = default;
/**
* @brief Submit a log message to the logging subsystem.
*
* @param payload User-provided log data (level, phase, message).
* @param origin Auto-captured source location of the call site.
*/
void Log(LogDTO payload,
std::source_location origin = std::source_location::current(),
std::chrono::system_clock::time_point timestamp = std::chrono::system_clock::now(),
std::thread::id thread_id = std::this_thread::get_id()) {
LogEntry entry;
entry.timestamp = timestamp;
entry.thread_id = thread_id;
entry.level = payload.level;
entry.phase = payload.phase;
entry.message = std::move(payload.message);
entry.origin = origin;
DoLog(std::move(entry));
}
protected:
/**
* @brief Underlying implementation to transport the log entry.
*
* Implementations must be thread-safe as DoLog can be called concurrently
* from multiple worker threads.
*/
virtual void DoLog(LogEntry log_entry) = 0;
};
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_LOGGING_LOGGER_H_

View File

@@ -12,11 +12,14 @@
*/
#include <filesystem>
#include <memory>
#include <stdexcept>
#include <string>
#include <string_view>
#include <unordered_map>
#include "services/logging/logger.h"
/**
* @brief Interface for loading named prompt files.
*/
@@ -56,6 +59,8 @@ class PromptDirectory final : public IPromptDirectory {
* directory.
*/
explicit PromptDirectory(const std::filesystem::path& prompt_dir);
PromptDirectory(const std::filesystem::path& prompt_dir,
std::shared_ptr<ILogger> logger);
/**
* @brief Loads the prompt for @p key, caching the result.
@@ -70,6 +75,7 @@ class PromptDirectory final : public IPromptDirectory {
private:
std::filesystem::path prompt_dir_;
std::shared_ptr<ILogger> logger_;
std::unordered_map<std::string, std::string> cache_;
};

View File

@@ -8,8 +8,11 @@
#include "web_client/web_client.h"
#include "services/logging/logger.h"
#include <memory>
#include <string>
#include <utility>
/**
* @brief WebClient implementation backed by cpp-httplib.
@@ -24,7 +27,8 @@
*/
class HttpWebClient final : public WebClient {
public:
HttpWebClient() = default;
explicit HttpWebClient(std::shared_ptr<ILogger> logger)
: logger_(std::move(logger)) {}
~HttpWebClient() override = default;
/**
@@ -43,6 +47,9 @@ public:
* @return Percent-encoded string safe for use in a URL.
*/
std::string EncodeURL(const std::string& value) override;
private:
std::shared_ptr<ILogger> logger_;
};