Webux Lab

By Studio Webux

Simple Deno script to parse my markdown to JSON

TG
Tommy Gingras Studio Webux 2024-10-27

A Simple Script to Index Markdown Content with Algolia

I decided to explore how to make my blog more discoverable using Algolia. In this post, I’ll share a simple script that indexes my markdown content into a JSON file that can then be uploaded to Algolia.

The Challenge

Algolia has a limit of 10KB per record, which means we need to find ways to parse and transform our markdown content into manageable chunks. This is where this script comes in – it takes care of splitting sections and indexing data for search engines like Algolia.

How It Works

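The script runs on Deno and uses Node.js built-in modules (`node:fs`, `node:path`, `node:buffer`) to read the markdown files. Articles that reach the size limit are split into sections at each heading, the text is sanitized (whitespace is collapsed), and the resulting records are written to an `index.json` file in the format I decided to use.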

This script can be adapted to work on other websites or with different types of content.

The script

// deno run -A tools/indexer.ts
// Generates index.json (script written 2024-10-27)
import { writeFileSync, readFileSync } from 'node:fs';
import { join } from 'node:path';
import { Buffer } from 'node:buffer';

// Matches one or more '#' at the start of a line; used to detect where a new markdown section begins.
const regexp = /^#+/;

/**
 * Shape of a single article entry in the per-language database JSON.
 *
 * The indexer only reads `visibility`, `url`, `categories`, `date`,
 * `metaDescription` and `metaKeywords`; the other fields are declared here
 * but not used by this script.
 */
interface Article {
  // Fields consumed by the indexer.
  visibility: boolean;
  url: string;
  categories: string[];
  date: string;
  metaDescription: string;
  metaKeywords: string;

  // Remaining fields of the entry.
  title: string;
  subtitle: string;
  text: string;
  author: string;
  initials: string;
  company: string;
  color: string;
  links: string[];
  tags: Array<{ icon: string; text: string }>;
}

/**
 * Shape of one section entry in the per-language database JSON: section-level
 * metadata plus the list of articles that belong to it.
 */
interface Section {
  title: string;
  description: string;
  metaDescription: string;
  metaKeywords: string;
  /** Articles listed under this section. */
  items: Article[];
}

/**
 * Collapses every run of whitespace (spaces, tabs, line breaks, ...) into a
 * single space and removes leading/trailing whitespace.
 *
 * @param txt Raw text to normalize.
 * @returns The text as single-space-separated words.
 */
function sanitize(txt: string) {
  // Splitting on whitespace runs leaves empty strings only at the edges;
  // dropping them is equivalent to trimming the collapsed text.
  const words = txt.split(/\s+/).filter((word) => word.length > 0);
  return words.join(' ');
}

// Site sections to index; each name is both a key in the database JSON and a
// directory under public/content.
const sections = [
  'web-projects',
  'misc-projects',
  'tutorials',
  'security',
  'devops'
];

// Languages to index; each one has a ./db/<lang>.json database file and a
// <lang>.md file per article.
const langs = ['en', 'fr'];

// Records accumulated by the indexing loop and finally written to index.json.
const indexes: {
  slug: string;
  content: string;
  categories: string[];
  date: string;
  metaDescription: string;
  metaKeywords: string;
}[] = [];

// Database content keyed by language, then by section name.
// Built from `langs` so that the list of languages is declared in one place:
// adding a language above automatically loads its ./db/<lang>.json file.
// NOTE(review): the parsed JSON is trusted to match `Section`; it is not validated.
const data: Record<string, Record<string, Section>> = Object.fromEntries(
  langs.map((lang) => [
    lang,
    JSON.parse(readFileSync(`./db/${lang}.json`, 'utf8'))
  ])
);
/**
 * Size threshold, in UTF-8 bytes, for the `content` of one index record.
 * Content that reaches this size is split into smaller records.
 * Only `content` is measured; the other record fields are not counted.
 */
const MAX_CONTENT_BYTES = 10000;

/**
 * Splits a markdown document into sections, starting a new section at every
 * line that matches `regexp` (a line beginning with '#').
 *
 * Empty lines are dropped and the remaining lines of a section are joined
 * with spaces. Text located before the first heading forms its own section.
 * Note that any line starting with '#' counts as a heading, including such
 * lines inside fenced code blocks.
 *
 * @param markdown Full markdown text of an article.
 * @returns The sections in document order (empty when every line is empty).
 */
function splitByHeading(markdown: string): string[] {
  const parts: string[] = [];
  let current = '';

  for (const line of markdown.split('\n')) {
    if (regexp.test(line) && current.length > 0) {
      parts.push(current);
      current = '';
    }
    if (line.length > 0) {
      current += line + ' ';
    }
  }

  // Flush the text that follows the last heading; without this the final
  // section of every split article would be missing from the index.
  if (current.length > 0) {
    parts.push(current);
  }

  return parts;
}

/**
 * Splits `text` into chunks that are each smaller than `maxBytes` (UTF-8),
 * cutting at spaces so words stay intact.
 *
 * A single word that is itself `maxBytes` or larger is cut between
 * characters (never inside a code point). Text already under the limit is
 * returned unchanged as a single chunk.
 *
 * @param text Text to split; expected to be sanitized (single spaces).
 * @param maxBytes Exclusive upper bound for a chunk's byte size; must be
 *   larger than the biggest single character (4 bytes).
 * @returns The chunks in order.
 */
function splitBySize(text: string, maxBytes: number): string[] {
  if (Buffer.byteLength(text, 'utf8') < maxBytes) {
    return [text];
  }

  const chunks: string[] = [];
  let current = '';
  let currentBytes = 0; // tracked incrementally to avoid re-measuring `current`

  const flush = () => {
    if (current.length > 0) {
      chunks.push(current);
      current = '';
      currentBytes = 0;
    }
  };

  for (const word of text.split(' ')) {
    const wordBytes = Buffer.byteLength(word, 'utf8');

    if (wordBytes >= maxBytes) {
      // The word alone does not fit in a chunk: cut it character by character.
      flush();
      for (const char of word) {
        const charBytes = Buffer.byteLength(char, 'utf8');
        if (currentBytes + charBytes >= maxBytes) {
          flush();
        }
        current += char;
        currentBytes += charBytes;
      }
      continue;
    }

    const separatorBytes = current.length > 0 ? 1 : 0;
    if (currentBytes + separatorBytes + wordBytes >= maxBytes) {
      flush();
    }
    if (current.length > 0) {
      current += ' ';
      currentBytes += 1;
    }
    current += word;
    currentBytes += wordBytes;
  }

  flush();
  return chunks;
}

// Build one or more index records for every visible article of every
// section, in every language.
for (const section of sections) {
  for (const lang of langs) {
    for (const item of Object.values(data[lang][section].items)) {
      // Articles explicitly marked as not visible are left out of the index.
      if (item.visibility === false) continue;

      const content = readFileSync(
        join('.', 'public', 'content', section, item.url, `${lang}.md`),
        'utf-8'
      );

      // Small articles are indexed as a single record; large ones are first
      // split at their headings.
      const parts =
        Buffer.byteLength(content, 'utf8') >= MAX_CONTENT_BYTES
          ? splitByHeading(content)
          : [content];

      for (const part of parts) {
        // A section can still be too large on its own, so it is split again
        // by size; sections under the limit come back as a single chunk.
        for (const chunk of splitBySize(sanitize(part), MAX_CONTENT_BYTES)) {
          indexes.push({
            slug: `/${lang}/${section}/${item.url}`,
            content: chunk,
            categories: item.categories,
            date: item.date,
            metaDescription: item.metaDescription,
            metaKeywords: item.metaKeywords
          });
        }
      }
    }
  }
}

// Write all records as pretty-printed JSON in the current working directory.
writeFileSync('./index.json', JSON.stringify(indexes, null, 2));

Feel free to adapt the script to your own needs and content structure!


Search