swg/src/MarkdownParser.cxx

/*
 * Copyright (C) 2022-2023 luca0N!
 *
 * This file is part of Static Website Generator (swg).
 *
 * Static Website Generator (swg) is free software: you can redistribute it
 * and/or modify it under the terms of the version 3 of the GNU Lesser General
 * Public License as published by the Free Software Foundation.
 *
 * Static Website Generator (swg) is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
 * General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Static Website Generator (swg). If not, see
 * <https://www.gnu.org/licenses/>.
 *
 * Contact luca0N! by e-mail: <luca0n [at] luca0n [dot] com>.
 */

#include "MarkdownParser.hxx"

#include <assert.h>
#include <iostream>

// ASCII code of the character '0'. Adding a value in the range 0-9 to it
// yields the code of the matching digit character; this file uses it to turn
// a header level into the digit of an <hN> tag.
#define ASCII_DIGIT_START	48

// Progress of the `[text](url)` hyperlink that is currently being parsed.
enum HyperlinkStage {
	NONE,			// not reading a hyperlink
	READING_CONTENTS,	// inside `[...]`: collecting the link text
	EXPECTING_URL,		// `]` was read; only a following `(` makes this a link
	READING_URL		// inside `(...)`: collecting the link address
};

// Parser state shared by the helpers below and by the parser itself.
std::string html,		// HTML generated so far
	tag_a_text_buf,		// used for storing the text contents of a hyperlink
	tag_a_buf;		// used for storing the hyperlink address
enum HyperlinkStage tag_a = NONE;

/*
 * Append `s` to whichever buffer the current hyperlink stage selects:
 *   NONE              -> the HTML output
 *   READING_CONTENTS  -> the pending link text
 *   READING_URL       -> the pending link address
 *
 * If the stage is EXPECTING_URL, the caller is emitting something other than
 * the opening parenthesis of a URL, so the preceding `[text]` was never a
 * hyperlink. The bracketed text is written to the HTML output literally, the
 * stage returns to NONE, and `s` is appended to the output after it. (This
 * situation previously tripped an assertion, or silently discarded `s` when
 * assertions were compiled out.)
 */
void append(std::string const &s) {
	switch (tag_a) {
		case EXPECTING_URL:
			html += '[';
			html += tag_a_text_buf;
			html += ']';
			tag_a_text_buf.clear();
			tag_a = NONE;
			[[fallthrough]];
		case NONE:
			html += s;
			break;
		case READING_CONTENTS:
			tag_a_text_buf += s;
			break;
		case READING_URL:
			tag_a_buf += s;
			break;
	}
}

/* Single-character convenience overload; same routing rules as above. */
void append(const char c) {
	append(std::string(1, c));
}

/*
 * Reset every piece of parser state so that a new document starts from
 * scratch. The hyperlink stage and buffers are cleared as well as the output;
 * otherwise a link left unfinished at the end of one document would swallow
 * the beginning of the next one.
 */
void cleanup() {
	html.clear();
	tag_a_text_buf.clear();
	tag_a_buf.clear();
	tag_a = NONE;
}

namespace MarkdownParser {

/**
 * Convert the Markdown file at `path` into an HTML fragment.
 *
 * The file is read in chunks of at most (buflen - 1) characters and scanned
 * one character at a time, so a long line is seen as several consecutive
 * chunks. Recognised syntax:
 *   - blank line: closes the open paragraph, or else the open list
 *   - `#` ... `######` at the start of a line: `<h1>` ... `<h6>`
 *   - `-` at the start of a line: item of an unordered list
 *   - `*text*`: `<b>`, `_text_`: `<i>`, `~~text~~`: `<s>`
 *   - `[text](url)`: `<a href="url">text</a>`
 *   - `<!-- ... -->`: copied through; markup inside is not interpreted
 *   - `\`: dropped from the output; a `*`, `_`, `#`, `~`, `[` or `]` that
 *     directly follows it is emitted literally
 *
 * The output is accumulated in the file-level `html` string (reset through
 * cleanup() on entry), so calls must not overlap.
 *
 * @param path  Markdown file to read.
 * @return      A copy of the generated HTML.
 *
 * Opening the file is only checked with assert().
 */
std::string make_html(std::filesystem::path const &path) {
	cleanup();
	FILE *mdFile = fopen(path.string().c_str(), "r");
	// TODO: Add proper error handling.
	assert(mdFile != nullptr);

	// A compile-time constant, so that `buf` is an ordinary fixed-size
	// array instead of a variable-length array (not standard C++).
	constexpr int buflen = 64;
	char buf[buflen];

	// Tag flags
	bool tag_b = false,
	     tag_i = false,
	     tag_p = false,
	     tag_li = false,
	     tag_ul = false,
	     tag_s = false,
	     tag_comment = false,
	     // True while the chunk being read starts at the beginning of
	     // a line of the input file.
	     newline = true,
	     // True when the current chunk was ended by a line break.
	     manualBreak = false,
	     // Used to ignore spaces at the beginning of header titles.
	     ignoreSpace = false;
	// Number of upcoming characters to discard (rest of `<!--` / `-->`).
	unsigned short char_skip = 0;

	// For counting sub-headers (h1, h2, h3, and so on)
	int tag_h = 0;

	// Last character of the previous chunk when that chunk stopped in the
	// middle of a line, '\0' otherwise. Lets the escape checks look one
	// character back even at index 0 of a chunk.
	char carry = '\0';

	while (fgets(buf, buflen, mdFile) != nullptr) {
		manualBreak = false;

		// Empty line; end paragraph. `newline` is required because a
		// chunk can consist of just "\n" when it is the tail of a
		// line that exactly filled the previous chunk.
		if (!tag_comment && tag_p && newline && buf[0] == '\n') {
			html += "</p>\n";
			tag_p = false;
			continue;
		}

		// End the ul tag if it's active and the line is empty.
		if (tag_ul && newline && buf[0] == '\n') {
			tag_ul = false;
			tag_li = false;
			html += "</li></ul>";
		}

		// Read character by character
		int x = 0;
		for (; x < buflen; x++) {
			if (char_skip > 0) {
				char_skip--;
				continue;
			}
			const char c = buf[x];
			if (c == '\0') break;

			// Character that precedes `c` in the input.
			const char prev = (x > 0) ? buf[x - 1] : carry;

			// "[text]" that is not followed by "(" is not a
			// hyperlink: emit it literally before anything else.
			if (tag_a == EXPECTING_URL && c != '(') {
				html += '[';
				html += tag_a_text_buf;
				html += ']';
				tag_a_text_buf.clear();
				tag_a = NONE;
			}

			if (!tag_comment && c == '\n') {
				// The next buffer iteration will hold the
				// first (buflen - 1) bytes of the next line.
				newline = true;
				manualBreak = true;
				append(' ');

				// If we were in the middle of inserting a
				// header tag, close it here.
				if (tag_h > 0) {
					html += "</h";
					html += static_cast<char>(ASCII_DIGIT_START + tag_h);
					html += ">";
					tag_h = 0;
				}
				if (!tag_li)
					html += '\n';
				break;
			}

			// Start paragraph if newline and no
			// special characters were matched.
			if (!tag_comment && !tag_ul && !tag_p && newline &&
					x == 0 && c != '#' && c != '-') {
				html += "<p>";
				tag_p = true;
			}

			// Note on the look-ahead reads (buf[x+1], ...): `c` is
			// not the terminator, so buf[x+1] is inside the
			// string or is its terminator, and every further
			// look-ahead is only evaluated after the previous one
			// matched a non-NUL character.
			switch (c) {
				case '<':
					// Check for HTML comment
					if (buf[x + 1] == '!' &&
					    buf[x + 2] == '-' &&
					    buf[x + 3] == '-') {
						char_skip = 3;
						tag_comment = true;
						html += "<!--";
					} else {
						append(c);
					}
					break;
				case '*':
					// Bold, unless inside a comment or escaped.
					if (tag_comment || prev == '\\') {
						append(c);
						break;
					}

					append(tag_b ? "</b>" : "<b>");
					tag_b = !tag_b;
					break;
				case '_':
					// Italics, unless inside a comment or escaped.
					if (tag_comment || prev == '\\') {
						append(c);
						break;
					}

					append(tag_i ? "</i>" : "<i>");
					tag_i = !tag_i;
					break;
				case '#':
					// Header check

					// Headers must be declared at the
					// beginning of a new line. Ignore it
					// if this is not a new line.
					if (tag_comment || !newline) {
						append(c);
						break;
					}

					// Literal if escaped, or if it is not
					// part of the unbroken run of '#' that
					// starts at column 0 (`tag_h` counts
					// the ones accepted so far, so the next
					// one must sit at index `tag_h`). This
					// keeps a '#' inside the header text
					// from opening another header.
					if (prev == '\\' || x != tag_h) {
						append(c);
						break;
					}

					// Support up to 6 levels of headers.
					// A seventh '#' opens the <h6> tag and
					// is itself added to the output.
					if (tag_h >= 6) {
						html += "<h";
						html += static_cast<char>(ASCII_DIGIT_START + tag_h);
						html += ">";
						html += '#';
						ignoreSpace = true;
						break;
					}
					tag_h++;

					// If we are done reading header
					// characters, finally add the tag.
					if (buf[x + 1] != '#') {
						html += "<h";
						html += static_cast<char>(ASCII_DIGIT_START + tag_h);
						html += ">";
						ignoreSpace = true;
					}
					break;
				case '~':
					// Literal inside a comment or when escaped.
					if (tag_comment || prev == '\\') {
						append(c);
						break;
					}
					// Second '~' of a pair: toggle strikethrough.
					if (x > 0 && buf[x - 1] == '~') {
						append(tag_s ? "</s>" : "<s>");
						tag_s = !tag_s;
						break;
					}
					// First '~' of a pair: wait for the second.
					if (buf[x + 1] == '~') break;
					append(c);
					break;
				case '-':
					// End of an HTML comment
					if (tag_comment &&
					    buf[x + 1] == '-' &&
					    buf[x + 2] == '>') {
						tag_comment = false;
						html += "-->";
						char_skip = 2;
						continue;
					}
					// Only a '-' at the very start of a
					// line begins a list item; index 0 of
					// a continuation chunk does not count.
					if (tag_comment || x != 0 || !newline) {
						append(c);
						break;
					}

					// Start unordered list tag if it's not active.
					if (!tag_ul) {
						html += "<ul>";
						tag_ul = true;
					}

					// End previous list item, if active.
					if (tag_li) {
						html += "</li>\n";
						tag_li = false;
					}

					html += "<li>";
					ignoreSpace = true;
					tag_li = true;
					break;
				case '[':
					// Hyperlink text declaration has begun
					if (tag_comment || tag_a != NONE || prev == '\\') {
						// Cannot add hyperlinks inside of hyperlinks;
						append(c);
						break;
					}
					tag_a_buf = "";
					tag_a_text_buf = "";
					tag_a = READING_CONTENTS;
					break;
				case ']':
					// Hyperlink text declaration ended
					if (tag_comment || tag_a != READING_CONTENTS || prev == '\\') {
						// Ignore if not reading hyperlink.
						append(c);
						break;
					}
					tag_a = EXPECTING_URL;
					break;
				case '(':
					// Hyperlink address declaration has begun
					if (tag_comment || tag_a != EXPECTING_URL) {
						append(c);
						break;
					}
					tag_a = READING_URL;
					break;
				case ')':
					// Hyperlink address declaration ended
					if (tag_comment || tag_a != READING_URL) {
						append(c);
						break;
					}
					// Leave link mode first so that the
					// tag is routed to the HTML output.
					tag_a = NONE;
					append("<a href=\"");
					append(tag_a_buf);
					append("\">");
					append(tag_a_text_buf);
					append("</a>");
					break;
				case ' ':
					// Swallow the single space that follows
					// a header or list marker.
					if (!tag_comment && ignoreSpace) {
						ignoreSpace = false;
						break;
					}
					append(" ");
					break;
				case '\\':
					// Escape character: never emitted itself.
					break;
				default:
					append(c);
					break;
			}
		}
		if (!manualBreak) newline = false;

		// Remember the last character only when the line continues in
		// the next chunk.
		carry = (!manualBreak && x > 0) ? buf[x - 1] : '\0';
	}

	fclose(mdFile);
	return html;
}

} // namespace MarkdownParser