A Simple Script to Index Markdown Content with Algolia
I decided to explore how I can make my blog more discoverable using Algolia. In this post, I’ll share a simple script that indexes my markdown content and sends it to Algolia.
The Challenge
Algolia has a limit of 10KB per record, which means we need to find ways to parse and transform our markdown content into manageable chunks. This is where this script comes in – it takes care of splitting sections and indexing data for search engines like Algolia.
How It Works
The script uses Deno as a runtime and Node.js standard libraries to read and parse markdown files.
It splits long content into sections, sanitizes the text, and writes the resulting records to the index.json file I chose as the output format.
This script can be adapted to work on other websites or with different types of content.
The script
// deno run -A tools/indexer.ts
// 2024-10-27 to generate the index.json
import { writeFileSync, readFileSync } from 'node:fs';
import { join } from 'node:path';
import { Buffer } from 'node:buffer';
const regexp = /^#+/;
type Article = {
visibility: boolean;
categories: string[];
links: string[];
author: string;
initials: string;
date: string;
color: string;
company: string;
metaDescription: string;
metaKeywords: string;
title: string;
subtitle: string;
text: string;
url: string;
tags: { icon: string; text: string }[];
};
type Section = {
title: string;
description: string;
metaDescription: string;
metaKeywords: string;
items: Article[];
};
/**
 * Collapse every run of whitespace (spaces, tabs, newlines, carriage
 * returns) into a single space and trim the ends, so each record is one
 * flat line of text.
 */
function sanitize(txt: string) {
  // \s already matches \n, \t and \r, so a single pass suffices; the
  // original second replace over /[\n\t\r]+/ was dead code.
  return txt.replace(/\s+/g, ' ').trim();
}
const sections = [
'web-projects',
'misc-projects',
'tutorials',
'security',
'devops'
];
const langs = ['en', 'fr'];
const indexes: {
slug: string;
content: string;
categories: string[];
date: string;
metaDescription: string;
metaKeywords: string;
}[] = [];
const data: Record<string, Record<string, Section>> = {
en: JSON.parse(readFileSync('./db/en.json', 'utf8')),
fr: JSON.parse(readFileSync('./db/fr.json', 'utf8'))
};
for (const section of sections) {
for (const lang of langs) {
Object.values(data[lang][section].items).forEach((item) => {
if (item.visibility === false) return;
const content = readFileSync(
join('.', 'public', 'content', section, item.url, `${lang}.md`),
'utf-8'
);
if (Buffer.from(content).byteLength >= 10000) {
const markdown_section: string[] = [];
const lines = content.split('\n');
let section_content: string = '';
for (const line of lines) {
if (line.match(regexp)) {
if (section_content.length > 0) {
markdown_section.push(section_content);
section_content = '';
}
}
const sanitized_line = line;
if (sanitized_line.length > 0) {
section_content += sanitized_line + ' ';
}
}
for (const split_section of markdown_section) {
if (Buffer.from(split_section).byteLength >= 10000) {
const sanitized = sanitize(split_section);
const split_in_half: string[] = [
sanitized.substring(0, sanitized.length / 2),
sanitized.substring(sanitized.length / 2)
];
for (const half of split_in_half) {
indexes.push({
slug: `/${lang}/${section}/${item.url}`,
content: half,
categories: item.categories,
date: item.date,
metaDescription: item.metaDescription,
metaKeywords: item.metaKeywords
});
}
} else {
indexes.push({
slug: `/${lang}/${section}/${item.url}`,
content: sanitize(split_section),
categories: item.categories,
date: item.date,
metaDescription: item.metaDescription,
metaKeywords: item.metaKeywords
});
}
}
} else {
indexes.push({
slug: `/${lang}/${section}/${item.url}`,
content: sanitize(content),
categories: item.categories,
date: item.date,
metaDescription: item.metaDescription,
metaKeywords: item.metaKeywords
});
}
});
}
}
writeFileSync('./index.json', JSON.stringify(indexes, null, 2));
Feel free to adjust the content according to your needs and writing style!