// Copyright 2020 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#if !defined(_WIN32)

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "libreprl.h"

#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <signal.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

// Well-known file descriptor numbers for reprl <-> child communication, child process side
#define REPRL_CHILD_CTRL_IN 100
#define REPRL_CHILD_CTRL_OUT 101
#define REPRL_CHILD_DATA_IN 102
#define REPRL_CHILD_DATA_OUT 103

/// Maximum timeout in microseconds. Mostly just limited by the fact that the timeout in milliseconds has to fit into a 32-bit integer.
#define REPRL_MAX_TIMEOUT_IN_MICROSECONDS ((uint64_t)(INT_MAX) * 1000)

static size_t min(size_t x, size_t y) {
  return x < y ? x : y;
}

#ifdef __linux__
// This function creates the UID/GID mapping that we need inside of the user
// namespace. This is needed such that the files we create have a proper owner
// attached to them.
static void write_id_maps(uid_t uid, gid_t gid) {
    char setgroups_path[] = "/proc/self/setgroups";
    char uid_map_path[] = "/proc/self/uid_map";
    char gid_map_path[] = "/proc/self/gid_map";

    int setgroups_fd = open(setgroups_path, O_WRONLY);
    int uid_map_fd = open(uid_map_path, O_WRONLY);
    int gid_map_fd = open(gid_map_path, O_WRONLY);

    if (setgroups_fd == -1 || uid_map_fd == -1 || gid_map_fd == -1) {
        fprintf(stderr, "Error opening setgroups/uid_map/gid_map file: %s\n", strerror(errno));
        _exit(-1);
    }

    // More context on this: https://lwn.net/Articles/626665/
    dprintf(setgroups_fd, "deny");
    dprintf(uid_map_fd, "%d %d 1", uid, uid);
    dprintf(gid_map_fd, "%d %d 1", gid, gid);

    close(setgroups_fd);
    close(uid_map_fd);
    close(gid_map_fd);
}

// Creates a tmpfs at `mount_point` in a new user namespace.
static void create_tmpfs(const char* mount_point) {
    // Get the UID and GID before we call unshare.
    uid_t uid = getuid();
    gid_t gid = getgid();

    // We create a new user (CLONE_NEWUSER) and mount (CLONE_NEWNS)
    // namespace here such that we can mount our own tmpfs onto
    // mount_point that is only visible to this process.
    if (unshare(CLONE_NEWUSER | CLONE_NEWNS) == -1) {
        fprintf(stderr, "unshare failed to create a new mount namespace in the child: %s\n", strerror(errno));
        _exit(-1);
    };

    // Now write the UID / GID mappings
    write_id_maps(uid, gid);

    // Mount a new tmpfs onto `mount_point` this allows us to add files
    // here that get automatically cleaned up once the process exits.
    if (mount("tmpfs", mount_point, "tmpfs", 0, NULL) == -1) {
        fprintf(stderr, "mount failed to create a tmpfs in namespace in the child: %s\n", strerror(errno));
        _exit(-1);
    }
}
#endif

static uint64_t current_usecs()
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
}

static char** copy_string_array(const char** orig)
{
    size_t num_entries = 0;
    for (const char** current = orig; *current; current++) {
        num_entries += 1;
    }
    char** copy = calloc(num_entries + 1, sizeof(char*));
    for (size_t i = 0; i < num_entries; i++) {
        copy[i] = strdup(orig[i]);
    }
    return copy;
}

static void free_string_array(char** arr)
{
    if (!arr) return;
    for (char** current = arr; *current; current++) {
        free(*current);
    }
    free(arr);
}

// A unidirectional communication channel for larger amounts of data, up to a maximum size (REPRL_MAX_DATA_SIZE).
// Implemented as a (RAM-backed) file for which the file descriptor is shared with the child process and which is mapped into our address space.
struct data_channel {
    // File descriptor of the underlying file. Directly shared with the child process.
    int fd;
    // Memory mapping of the file, always of size REPRL_MAX_DATA_SIZE.
    char* mapping;
};

struct reprl_context {
    // Whether reprl_initialize has been successfully performed on this context.
    int initialized;

    // Read file descriptor of the control pipe. Only valid if a child process is running (i.e. pid is nonzero).
    int ctrl_in;
    // Write file descriptor of the control pipe. Only valid if a child process is running (i.e. pid is nonzero).
    int ctrl_out;

    // Data channel REPRL -> Child
    struct data_channel* data_in;
    // Data channel Child -> REPRL
    struct data_channel* data_out;

    // Optional data channel for the child's stdout and stderr.
    struct data_channel* child_stdout;
    struct data_channel* child_stderr;

    // PID of the child process. Will be zero if no child process is currently running.
    pid_t pid;

    // Arguments and environment for the child process.
    char** argv;
    char** envp;

    // A malloc'd string containing a description of the last error that occurred.
    char* last_error;
};

static int reprl_error(struct reprl_context* ctx, const char *format, ...)
{
    va_list args;
    va_start(args, format);
    free(ctx->last_error);
    vasprintf(&ctx->last_error, format, args);
    return -1;
}

static struct data_channel* reprl_create_data_channel(struct reprl_context* ctx)
{
#ifdef __linux__
    int fd = memfd_create("REPRL_DATA_CHANNEL", MFD_CLOEXEC);
#else
    char path[] = "/tmp/reprl_data_channel_XXXXXXXX";
    int fd = mkostemp(path, O_CLOEXEC);
    unlink(path);
#endif
    if (fd == -1 || ftruncate(fd, REPRL_MAX_DATA_SIZE) != 0) {
        reprl_error(ctx, "Failed to create data channel file: %s", strerror(errno));
        return NULL;
    }
    char* mapping = mmap(0, REPRL_MAX_DATA_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (mapping == MAP_FAILED) {
        reprl_error(ctx, "Failed to mmap data channel file: %s", strerror(errno));
        return NULL;
    }

    struct data_channel* channel = malloc(sizeof(struct data_channel));
    channel->fd = fd;
    channel->mapping = mapping;
    return channel;
}

static void reprl_destroy_data_channel(struct data_channel* channel)
{
    if (!channel) return;
    close(channel->fd);
    munmap(channel->mapping, REPRL_MAX_DATA_SIZE);
    free(channel);
}

static void reprl_child_terminated(struct reprl_context* ctx)
{
    if (!ctx->pid) return;
    ctx->pid = 0;
    close(ctx->ctrl_in);
    close(ctx->ctrl_out);
}

static void reprl_terminate_child(struct reprl_context* ctx)
{
    if (!ctx->pid) return;
    int status;
    kill(ctx->pid, SIGKILL);
    waitpid(ctx->pid, &status, 0);
    reprl_child_terminated(ctx);
}

static int reprl_spawn_child(struct reprl_context* ctx)
{
    // This is also a good time to ensure the data channel backing files don't grow too large.
    ftruncate(ctx->data_in->fd, REPRL_MAX_DATA_SIZE);
    ftruncate(ctx->data_out->fd, REPRL_MAX_DATA_SIZE);
    if (ctx->child_stdout) ftruncate(ctx->child_stdout->fd, REPRL_MAX_DATA_SIZE);
    if (ctx->child_stderr) ftruncate(ctx->child_stderr->fd, REPRL_MAX_DATA_SIZE);

    int crpipe[2] = { 0, 0 };          // control pipe child -> reprl
    int cwpipe[2] = { 0, 0 };          // control pipe reprl -> child

    if (pipe(crpipe) != 0) {
        return reprl_error(ctx, "Could not create pipe for REPRL communication: %s", strerror(errno));
    }
    if (pipe(cwpipe) != 0) {
        close(crpipe[0]);
        close(crpipe[1]);
        return reprl_error(ctx, "Could not create pipe for REPRL communication: %s", strerror(errno));
    }

    ctx->ctrl_in = crpipe[0];
    ctx->ctrl_out = cwpipe[1];
    fcntl(ctx->ctrl_in, F_SETFD, FD_CLOEXEC);
    fcntl(ctx->ctrl_out, F_SETFD, FD_CLOEXEC);

#ifdef __linux__
    // This is where we will mount our own tmpfs, this is intended to be used
    // for targets like Chrome, where we have to pass the user data directory.
    // Even if the target does not clean up after themselves, the tmpfs in the
    // user namespace will be removed once the process exits. Also, every child
    // process, i.e. fuzzing instance, can then have it's own tmpfs.
    // This only works on Linux right now, which is where we fuzz Chrome, this
    // won't work on any other OS.
    const char mount_point[] = "/tmp/fuzzilli_tmp";

    // Create the mountpoint for our tmpfs here. This is just an empty dir.
    // We also do not really care if this directory exists, we just need it as
    // a mountpoint.
    if (mkdir(mount_point, 0)) {
        if (errno != EEXIST) {
          fprintf(stderr, "mkdir failed to create %s to create a mountpoint: %s\n", mount_point, strerror(errno));
        }
    }
#endif

#ifdef __linux__
    // Use vfork() on Linux as that considerably improves the fuzzer performance. See also https://github.com/googleprojectzero/fuzzilli/issues/174
    // Due to vfork, the code executed in the child process *must not* modify any memory apart from its stack, as it will share the page table of its parent.
    pid_t pid = vfork();
#else
    pid_t pid = fork();
#endif
    if (pid == 0) {
        if (dup2(cwpipe[0], REPRL_CHILD_CTRL_IN) < 0 ||
            dup2(crpipe[1], REPRL_CHILD_CTRL_OUT) < 0 ||
            dup2(ctx->data_out->fd, REPRL_CHILD_DATA_IN) < 0 ||
            dup2(ctx->data_in->fd, REPRL_CHILD_DATA_OUT) < 0) {
            fprintf(stderr, "dup2 failed in the child: %s\n", strerror(errno));
            _exit(-1);
        }

#ifdef __linux__
        // Set RLIMIT_CORE to 0, such that we don't produce core dumps. The
        // added benefit of doing this here, in the child process, is that we
        // can still get core dumps when Fuzzilli crashes.
        struct rlimit core_limit;
        core_limit.rlim_cur = 0;
        core_limit.rlim_max = 0;
        if (setrlimit(RLIMIT_CORE, &core_limit) < 0) {
            fprintf(stderr, "setrlimit failed in the child: %s\n", strerror(errno));
            _exit(-1);
        };
#endif

        // Unblock any blocked signals. It seems that libdispatch sometimes blocks delivery of certain signals.
        sigset_t newset;
        sigemptyset(&newset);
        if (sigprocmask(SIG_SETMASK, &newset, NULL) != 0) {
            fprintf(stderr, "sigprocmask failed in the child: %s\n", strerror(errno));
            _exit(-1);
        }

        close(cwpipe[0]);
        close(crpipe[1]);

        int devnull = open("/dev/null", O_RDWR);
        dup2(devnull, 0);
        if (ctx->child_stdout) dup2(ctx->child_stdout->fd, 1);
        else dup2(devnull, 1);
        if (ctx->child_stderr) dup2(ctx->child_stderr->fd, 2);
        else dup2(devnull, 2);
        close(devnull);

#ifdef __linux__
        // Create the tmpfs at the specific mount point here in the child process
        // such that we have a tmpfs for this process only that will be cleaned up at process exit.
        // This will also write into the necessary files in /proc, so we need to do this here after we've fork()'ed.
        // This will only work on Linux, see the comment above where call mkdir.
        create_tmpfs(mount_point);
#endif

        // close all other FDs. We try to use FD_CLOEXEC everywhere, but let's be extra sure we don't leak any fds to the child.
        int tablesize = getdtablesize();
        for (int i = 3; i < tablesize; i++) {
            if (i == REPRL_CHILD_CTRL_IN || i == REPRL_CHILD_CTRL_OUT || i == REPRL_CHILD_DATA_IN || i == REPRL_CHILD_DATA_OUT) {
                continue;
            }
            close(i);
        }

        execve(ctx->argv[0], ctx->argv, ctx->envp);

        fprintf(stderr, "Failed to execute child process %s: %s\n", ctx->argv[0], strerror(errno));
        fflush(stderr);
        _exit(-1);
    }

    close(crpipe[1]);
    close(cwpipe[0]);

    if (pid < 0) {
        close(ctx->ctrl_in);
        close(ctx->ctrl_out);
        return reprl_error(ctx, "Failed to fork: %s", strerror(errno));
    }
    ctx->pid = pid;

    char helo[5] = { 0 };
    if (read(ctx->ctrl_in, helo, 4) != 4) {
        reprl_terminate_child(ctx);
        return reprl_error(ctx, "Did not receive HELO message from child: %s", strerror(errno));
    }

    if (strncmp(helo, "HELO", 4) != 0) {
        reprl_terminate_child(ctx);
        return reprl_error(ctx, "Received invalid HELO message from child: %s", helo);
    }

    if (write(ctx->ctrl_out, helo, 4) != 4) {
        reprl_terminate_child(ctx);
        return reprl_error(ctx, "Failed to send HELO reply message to child: %s", strerror(errno));
    }

#ifdef __linux__
    struct rlimit core_limit = {};
    if (prlimit(pid, RLIMIT_CORE, NULL, &core_limit) < 0) {
        reprl_terminate_child(ctx);
        return reprl_error(ctx, "prlimit failed: %s\n", strerror(errno));
    }
    if (core_limit.rlim_cur != 0 || core_limit.rlim_max != 0) {
        reprl_terminate_child(ctx);
        return reprl_error(ctx, "Detected non-zero RLIMIT_CORE. Check that the child does not set RLIMIT_CORE manually.\n");
    }
#endif

    return 0;
}

struct reprl_context* reprl_create_context()
{
    // "Reserve" the well-known REPRL fds so no other fd collides with them.
    // This would cause various kinds of issues in reprl_spawn_child.
    // It would be enough to do this once per process in the case of multiple
    // REPRL instances, but it's probably not worth the implementation effort.
    int devnull = open("/dev/null", O_RDWR);
    dup2(devnull, REPRL_CHILD_CTRL_IN);
    dup2(devnull, REPRL_CHILD_CTRL_OUT);
    dup2(devnull, REPRL_CHILD_DATA_IN);
    dup2(devnull, REPRL_CHILD_DATA_OUT);
    close(devnull);

    return calloc(1, sizeof(struct reprl_context));
}

int reprl_initialize_context(struct reprl_context* ctx, const char** argv, const char** envp, int capture_stdout, int capture_stderr)
{
    if (ctx->initialized) {
        return reprl_error(ctx, "Context is already initialized");
    }

    // We need to ignore SIGPIPE since we could end up writing to a pipe after our child process has exited.
    signal(SIGPIPE, SIG_IGN);

    ctx->argv = copy_string_array(argv);
    ctx->envp = copy_string_array(envp);

    ctx->data_in = reprl_create_data_channel(ctx);
    ctx->data_out = reprl_create_data_channel(ctx);
    if (capture_stdout) {
        ctx->child_stdout = reprl_create_data_channel(ctx);
    }
    if (capture_stderr) {
        ctx->child_stderr = reprl_create_data_channel(ctx);
    }
    if (!ctx->data_in || !ctx->data_out || (capture_stdout && !ctx->child_stdout) || (capture_stderr && !ctx->child_stderr)) {
        // Proper error message will have been set by reprl_create_data_channel
        return -1;
    }

    ctx->initialized = 1;
    return 0;
}

void reprl_destroy_context(struct reprl_context* ctx)
{
    reprl_terminate_child(ctx);

    free_string_array(ctx->argv);
    free_string_array(ctx->envp);

    reprl_destroy_data_channel(ctx->data_in);
    reprl_destroy_data_channel(ctx->data_out);
    reprl_destroy_data_channel(ctx->child_stdout);
    reprl_destroy_data_channel(ctx->child_stderr);

    free(ctx->last_error);
    free(ctx);
}

int reprl_execute(struct reprl_context* ctx, const char* script, uint64_t script_size, uint64_t timeout, uint64_t* execution_time, int fresh_instance)
{
    if (!ctx->initialized) {
        return reprl_error(ctx, "REPRL context is not initialized");
    }

    if (script_size > REPRL_MAX_DATA_SIZE) {
        return reprl_error(ctx, "Script too large");
    }

    if (timeout > REPRL_MAX_TIMEOUT_IN_MICROSECONDS) {
        return reprl_error(ctx, "Timeout too large");
    }
    int timeout_ms = (int)(timeout / 1000);

    // Terminate any existing instance if requested.
    if (fresh_instance && ctx->pid) {
        reprl_terminate_child(ctx);
    }

    // Reset file position so the child can simply read(2) and write(2) to these fds.
    lseek(ctx->data_out->fd, 0, SEEK_SET);
    lseek(ctx->data_in->fd, 0, SEEK_SET);
    if (ctx->child_stdout) {
        lseek(ctx->child_stdout->fd, 0, SEEK_SET);
    }
    if (ctx->child_stderr) {
        lseek(ctx->child_stderr->fd, 0, SEEK_SET);
    }

    // Spawn a new instance if necessary.
    if (!ctx->pid) {
        int r = reprl_spawn_child(ctx);
        if (r != 0) return r;
    }

    // Copy the script to the data channel.
    memcpy(ctx->data_out->mapping, script, script_size);

    // Tell child to execute the script.
    if (write(ctx->ctrl_out, "exec", 4) != 4 ||
        write(ctx->ctrl_out, &script_size, 8) != 8) {
        // These can fail if the child unexpectedly terminated between executions.
        // Check for that here to be able to provide a better error message.
        int status;
        if (waitpid(ctx->pid, &status, WNOHANG) == ctx->pid) {
            reprl_child_terminated(ctx);
            if (WIFEXITED(status)) {
                return reprl_error(ctx, "Child unexpectedly exited with status %i between executions", WEXITSTATUS(status));
            } else {
                return reprl_error(ctx, "Child unexpectedly terminated with signal %i between executions", WTERMSIG(status));
            }
        }
        return reprl_error(ctx, "Failed to send command to child process: %s", strerror(errno));
    }

    // Wait for child to finish execution (or crash).
    uint64_t start_time = current_usecs();
    struct pollfd fds = {.fd = ctx->ctrl_in, .events = POLLIN, .revents = 0};
    int res = poll(&fds, 1, timeout_ms);
    *execution_time = current_usecs() - start_time;
    if (res == 0) {
        // Execution timed out. Kill child and return a timeout status.
        reprl_terminate_child(ctx);
        return 1 << 16;
    } else if (res != 1) {
        // An error occurred.
        // We expect all signal handlers to be installed with SA_RESTART, so receiving EINTR here is unexpected and thus also an error.
        return reprl_error(ctx, "Failed to poll: %s", strerror(errno));
    }

    // Poll succeeded, so there must be something to read now (either the status or EOF).
    int status;
    ssize_t rv = read(ctx->ctrl_in, &status, 4);
    if (rv < 0) {
        return reprl_error(ctx, "Failed to read from control pipe: %s", strerror(errno));
    } else if (rv != 4) {
        // Most likely, the child process crashed and closed the write end of the control pipe.
        // Unfortunately, there probably is nothing that guarantees that waitpid() will immediately succeed now,
        // and we also don't want to block here. So just retry waitpid() a few times...
        int success = 0;
        do {
            success = waitpid(ctx->pid, &status, WNOHANG) == ctx->pid;
            if (!success) usleep(10);
        } while (!success && current_usecs() - start_time < timeout);

        if (!success) {
            // Wait failed, so something weird must have happened. Maybe somehow the control pipe was closed without the child exiting?
            // Probably the best we can do is kill the child and return an error.
            reprl_terminate_child(ctx);
            return reprl_error(ctx, "Child in weird state after execution");
        }

        // Cleanup any state related to this child process.
        reprl_child_terminated(ctx);

        if (WIFEXITED(status)) {
            status = WEXITSTATUS(status) << 8;
        } else if (WIFSIGNALED(status)) {
            status = WTERMSIG(status);
        } else {
            // This shouldn't happen, since we don't specify WUNTRACED for waitpid...
            return reprl_error(ctx, "Waitpid returned unexpected child state %i", status);
        }
    }

    // The status must be a positive number, see the status encoding format below.
    // We also don't allow the child process to indicate a timeout. If we wanted,
    // we could treat it as an error if the upper bits are set.
    status &= 0xffff;

    return status;
}

static const char* fetch_data_channel_content(struct data_channel* channel)
{
    if (!channel) return "";
    size_t pos = lseek(channel->fd, 0, SEEK_CUR);
    pos = min(pos, REPRL_MAX_DATA_SIZE - 1);
    channel->mapping[pos] = 0;
    return channel->mapping;
}

const char* reprl_fetch_fuzzout(struct reprl_context* ctx)
{
    return fetch_data_channel_content(ctx->data_in);
}

const char* reprl_fetch_stdout(struct reprl_context* ctx)
{
    return fetch_data_channel_content(ctx->child_stdout);
}

const char* reprl_fetch_stderr(struct reprl_context* ctx)
{
    return fetch_data_channel_content(ctx->child_stderr);
}

const char* reprl_get_last_error(struct reprl_context* ctx)
{
    return ctx->last_error;
}

#endif