/*
 * webpage/lexer.c
 * 332 lines, 8.3 KiB, C
 */
/* 2023-12-31 14:54:14 +01:00 */
#include "lexer.h"
/* 2024-01-02 20:53:15 +01:00 */
#define HTML_BALISE_LEN 12
2024-01-04 14:56:56 +01:00
#define DA_LEN 64
2024-01-02 20:53:15 +01:00
typedef struct Cursor {
int chunk;
int offset;
} Cursor;
2024-01-02 22:33:34 +01:00
Cursor curr = {
2023-12-31 14:54:14 +01:00
.chunk = 0,
2024-01-02 20:53:15 +01:00
.offset = -1,
2023-12-31 14:54:14 +01:00
};
2024-01-02 20:53:15 +01:00
Cursor prev = {
.chunk = 0,
.offset = -2,
};
2024-01-03 17:41:19 +01:00
TokenType token_by_name(const char name[HTML_BALISE_LEN]);
/* 2024-01-02 22:44:42 +01:00 */
/*
 * Print a token on stdout as "<type>: '<value>'".
 *
 * A NULL token prints "NULL TOKEN"; a token whose value is NULL prints its
 * numeric type followed by a placeholder.
 */
void printtoken(Token* token){
    if (token != NULL){
        printf("%d: ", token->type);
        if (token->value != NULL){
            printf("'%s'\n", token->value);
        } else {
            puts("'NO VALUE FOUND'");
        }
    } else {
        puts("NULL TOKEN");
    }
}
/* 2024-01-02 22:33:34 +01:00 */
/*
 * Advance `cursor` by one character through the page chunks.
 *
 * An offset below -1 is a "not started yet" state and is simply counted up.
 * Otherwise the cursor moves to the next character of the current chunk, or
 * to offset 0 of the following chunk when the current chunk is exhausted.
 *
 * Returns 0 on success and -1 once the cursor is past the last chunk.
 */
int increment_cursor(Cursor* cursor){
    if (cursor->offset + 1 < 0){
        cursor->offset++;
        return 0;
    }

    /* Never index a chunk that does not exist: this happens for an empty
     * page and when the function is called again after the end was reached. */
    if (cursor->chunk < 0 || cursor->chunk >= page.len){
        return -1;
    }

    const char* chunk = page.chunks[cursor->chunk];
    /* Sitting on the terminator itself only happens for an empty chunk;
     * looking one byte further would read past its end. */
    int on_terminator = cursor->offset >= 0 && chunk[cursor->offset] == '\0';

    if (on_terminator || chunk[cursor->offset + 1] == '\0'){
        cursor->chunk++;
        cursor->offset = 0;
    } else {
        cursor->offset++;
    }

    if (cursor->chunk >= page.len){
        return -1;
    }
    return 0;
}
char* nextchar(void){
2024-01-02 22:33:34 +01:00
if (increment_cursor(&curr) < 0 || increment_cursor(&prev) < 0){
2024-01-02 21:44:27 +01:00
return NULL;
}
2023-12-31 14:54:14 +01:00
2024-01-02 22:33:34 +01:00
if (curr.chunk >= page.len){
return NULL;
}
2024-01-02 22:33:34 +01:00
return &page.chunks[curr.chunk][curr.offset];
2023-12-31 14:54:14 +01:00
}
/* 2024-01-02 20:53:15 +01:00 */
/*
 * Step the read cursor back by one character: `curr` takes the position
 * stored in `prev`, and `prev` moves one offset further back.
 *
 * Offset -1 is accepted because it is the state `curr` starts in (just before
 * the first character), so stepping back from the very first character of
 * the page is legitimate. Anything further back aborts the program.
 */
void go_back(void){
    curr = prev;
    prev.offset--;

    /* The old test (offset < 0) rejected the valid pre-start position and
     * aborted on pages whose first byte is text. */
    if (curr.chunk < 0 || curr.offset < -1){
        puts("ERROR: cursor got way too back.");
        exit(1);
    }
}
/* 2024-01-03 16:17:52 +01:00 */
char* getParam(const char* word, int len, char* cursor, int* size){
2024-01-03 20:00:08 +01:00
char* res = NULL;
int found = 0;
len--;
2024-01-03 16:17:52 +01:00
do {
2024-01-03 20:00:08 +01:00
cursor = nextchar();
for (int i=0; i<len; i++){
if (word[i] != *cursor){
found = -1;
2024-01-03 16:17:52 +01:00
break;
2024-01-03 20:00:08 +01:00
} else {
found = 1;
cursor = nextchar();
2024-01-03 16:17:52 +01:00
}
}
2024-01-03 20:00:08 +01:00
if (found == 1){
while (*cursor != '"'){
cursor = nextchar();
}
2024-01-04 14:56:56 +01:00
int cap = DA_LEN;
res = malloc(sizeof(char) * cap);
2024-01-03 16:17:52 +01:00
*size = 0;
2024-01-03 20:00:08 +01:00
2024-01-03 16:17:52 +01:00
do {
2024-01-03 20:00:08 +01:00
cursor = nextchar();
2024-01-04 14:56:56 +01:00
res[*size] = *cursor;
2024-01-03 16:17:52 +01:00
(*size)++;
2024-01-04 14:56:56 +01:00
if (*size >= cap){
cap *= 2;
res = realloc(res, cap);
}
2024-01-03 16:17:52 +01:00
} while (*cursor != '"');
2024-01-03 17:41:19 +01:00
2024-01-04 14:56:56 +01:00
res[*size-1] = '\0';
2024-01-03 20:00:08 +01:00
break;
}
2024-01-03 16:17:52 +01:00
} while (*cursor != '>');
2024-01-03 20:00:08 +01:00
return res;
2024-01-03 16:17:52 +01:00
}
/* 2024-01-02 22:44:42 +01:00 */
/*
 * Build a TEXT token from the characters between the current position and
 * the next '<' (or the end of the input).
 *
 * token, cursor : ignored on entry; both are overwritten inside.
 *
 * The character that triggered the call is re-read, and the terminating '<'
 * is pushed back so the next token starts on it. `value` is NUL-terminated
 * and `len` is its length without the terminator.
 *
 * Returns the new token (caller owns it and its value), or NULL if memory
 * cannot be allocated.
 */
Token* create_text_token(Token* token, char* cursor){
    int cap = DA_LEN;
    int i = 0;
    int reached_end = 0;

    token = malloc(sizeof *token);
    if (token == NULL){
        return NULL;
    }
    token->value = malloc((size_t)cap);
    if (token->value == NULL){
        free(token);
        return NULL;
    }

    go_back(); /* re-read the first character of the text */
    for (;;){
        cursor = nextchar();
        if (cursor == NULL){
            /* Text running to the end of the input is still a token. */
            reached_end = 1;
            break;
        }
        if (*cursor == '<'){
            break;
        }
        token->value[i] = *cursor;
        i++;
        if (i >= cap){
            /* Growing here keeps room for the terminator written below. */
            char* grown = realloc(token->value, (size_t)cap * 2);
            if (grown == NULL){
                free(token->value);
                free(token);
                return NULL;
            }
            token->value = grown;
            cap *= 2;
        }
    }
    token->value[i] = '\0';

    if (!reached_end){
        go_back(); /* leave the '<' for the next token */
    }
    token->type = TEXT;
    token->len = i;
    return token;
}
/* 2024-01-02 22:44:42 +01:00 */
/*
 * Build a token for the tag that starts at the '<' just read, and consume
 * the input up to and including the tag's closing '>'.
 *
 * token, cursor : ignored on entry; both are overwritten inside.
 *
 * Token contents by type:
 *   A     - value is the href attribute as returned by getParam() and len is
 *           the size getParam() reported; value is NULL and len 0 when there
 *           is no href.
 *   IMG   - value is "<src> <alt>" (alt is empty when missing) and len is its
 *           length; value is NULL and len 0 when there is no src. src must
 *           come before alt in the tag for alt to be found.
 *   other - value is a copy of the tag name and len is its length.
 *
 * Returns the new token (caller owns it and its value), or NULL when the
 * input ends inside the tag name or the token cannot be allocated.
 */
Token* create_balise_token(Token* token, char* cursor){
    char balise[HTML_BALISE_LEN] = {0};
    int len = 0;

    cursor = nextchar();
    while (cursor != NULL && *cursor != '>' && *cursor != ' ' && *cursor != '\n' && *cursor != '\t' && len < HTML_BALISE_LEN){
        balise[len] = *cursor;
        len++;
        cursor = nextchar();
    }
    if (cursor == NULL){
        return NULL;
    }

    token = malloc(sizeof *token);
    if (token == NULL){
        return NULL;
    }
    token->type = token_by_name(balise);
    token->value = NULL;
    token->len = 0;

    if (token->type == A){
        token->value = getParam("href", sizeof("href"), cursor, &token->len);
        if (token->value == NULL){
            token->len = 0;
        }
    } else if (token->type == IMG) {
        int srclen = 0;
        char* src = getParam("src", sizeof("src"), cursor, &srclen);

        if (src != NULL){
            /* Only look for alt when src was found: a failed search already
             * stopped on '>' and another one would run into the next tag. */
            int altlen = 0;
            char* alt = getParam("alt", sizeof("alt"), cursor, &altlen);

            /* getParam() sizes count the closing quote, hence the -1. */
            int srcchars = srclen - 1;
            int altchars = (alt != NULL) ? altlen - 1 : 0;
            int total = srcchars + 1 + altchars;

            token->value = malloc((size_t)total + 1);
            if (token->value != NULL){
                memcpy(token->value, src, (size_t)srcchars);
                token->value[srcchars] = ' ';
                if (altchars > 0){
                    memcpy(token->value + srcchars + 1, alt, (size_t)altchars);
                }
                token->value[total] = '\0';
                token->len = total;
            }
            free(alt);
        }
        free(src);
    } else {
        /* balise may be completely full and therefore unterminated, so copy
         * exactly len bytes and terminate explicitly. */
        token->value = malloc((size_t)len + 1);
        if (token->value != NULL){
            memcpy(token->value, balise, (size_t)len);
            token->value[len] = '\0';
            token->len = len;
        }
    }

    /* Re-read the current character, then skip to the end of the tag. */
    go_back();
    do {
        cursor = nextchar();
    } while (cursor != NULL && *cursor != '>');
    return token;
}
/* 2023-12-31 14:54:14 +01:00 */
/*
 * Read the next token from the page.
 *
 * Leading NUL, space, newline and tab characters are skipped. A '<' starts a
 * tag token; anything else starts a text token.
 *
 * Returns NULL when the input is exhausted, including when only whitespace
 * remains, or when the token builder returns NULL.
 */
Token* nexttoken(void){
    char* cursor = nextchar();

    /* nextchar() can run out of input while whitespace is being skipped, so
     * the pointer is checked on every step, not just the first. */
    while (cursor != NULL && (*cursor == '\0' || *cursor == ' ' || *cursor == '\n' || *cursor == '\t')){
        cursor = nextchar();
    }
    if (cursor == NULL){
        return NULL;
    }

    if (*cursor == '<'){
        return create_balise_token(NULL, cursor);
    }
    return create_text_token(NULL, cursor);
}
/* 2024-01-03 17:41:19 +01:00 */
TokenType token_by_name(const char name[HTML_BALISE_LEN]){
if (name == NULL){
return UNDEFINED_TYPE;
} else if (strncmp(name, "body", HTML_BALISE_LEN) == 0){
return BODY;
} else if (strncmp(name, "/body", HTML_BALISE_LEN) == 0){
return END_BODY;
} else if (strncmp(name, "/html", HTML_BALISE_LEN) == 0){
return END_HTML;
2024-01-04 19:36:30 +01:00
} else if (strncmp(name, "title", HTML_BALISE_LEN) == 0){
return TITLE;
} else if (strncmp(name, "/title", HTML_BALISE_LEN) == 0){
return END_TITLE;
2024-01-03 17:41:19 +01:00
} else if (strncmp(name, "em", HTML_BALISE_LEN) == 0){
return EM;
} else if (strncmp(name, "/em", HTML_BALISE_LEN) == 0){
return END_EM;
} else if (strncmp(name, "i", HTML_BALISE_LEN) == 0){
return I;
} else if (strncmp(name, "/i", HTML_BALISE_LEN) == 0){
return END_I;
} else if (strncmp(name, "b", HTML_BALISE_LEN) == 0){
return B;
} else if (strncmp(name, "/b", HTML_BALISE_LEN) == 0){
return END_B;
} else if (strncmp(name, "strong", HTML_BALISE_LEN) == 0){
return STRONG;
} else if (strncmp(name, "/strong", HTML_BALISE_LEN) == 0){
return END_STRONG;
} else if (strncmp(name, "hr", HTML_BALISE_LEN) == 0){
return HR;
} else if (strncmp(name, "br", HTML_BALISE_LEN) == 0){
return BR;
2024-01-04 19:58:12 +01:00
} else if (strncmp(name, "p", HTML_BALISE_LEN) == 0){
return P;
2024-01-04 19:36:30 +01:00
} else if (strncmp(name, "/p", HTML_BALISE_LEN) == 0){
return END_P;
2024-01-03 17:41:19 +01:00
} else if (strncmp(name, "a", HTML_BALISE_LEN) == 0){
return A;
} else if (strncmp(name, "/a", HTML_BALISE_LEN) == 0){
return END_A;
} else if (strncmp(name, "ol", HTML_BALISE_LEN) == 0){
return OL;
} else if (strncmp(name, "/ol", HTML_BALISE_LEN) == 0){
return END_OL;
} else if (strncmp(name, "ul", HTML_BALISE_LEN) == 0){
return UL;
} else if (strncmp(name, "/ul", HTML_BALISE_LEN) == 0){
return END_UL;
} else if (strncmp(name, "li", HTML_BALISE_LEN) == 0){
return LI;
} else if (strncmp(name, "/li", HTML_BALISE_LEN) == 0){
return END_LI;
} else if (strncmp(name, "img", HTML_BALISE_LEN) == 0){
return IMG;
} else if (strncmp(name, "/img", HTML_BALISE_LEN) == 0){
return END_IMG;
} else if (strncmp(name, "blockquote", HTML_BALISE_LEN) == 0){
return BLOCKQUOTE;
} else if (strncmp(name, "/blockquote", HTML_BALISE_LEN) == 0){
return END_BLOCKQUOTE;
} else if (strncmp(name, "code", HTML_BALISE_LEN) == 0){
return CODE;
} else if (strncmp(name, "/code", HTML_BALISE_LEN) == 0){
return END_CODE;
} else if (strncmp(name, "h1", HTML_BALISE_LEN) == 0){
return H1;
} else if (strncmp(name, "h2", HTML_BALISE_LEN) == 0){
return H2;
} else if (strncmp(name, "h3", HTML_BALISE_LEN) == 0){
return H3;
} else if (strncmp(name, "h4", HTML_BALISE_LEN) == 0){
return H4;
} else if (strncmp(name, "h5", HTML_BALISE_LEN) == 0){
return H5;
} else if (strncmp(name, "h6", HTML_BALISE_LEN) == 0){
2024-01-04 19:36:30 +01:00
return H6;
} else if (strncmp(name, "/h1", HTML_BALISE_LEN) == 0){
return END_H1;
} else if (strncmp(name, "/h2", HTML_BALISE_LEN) == 0){
return END_H2;
} else if (strncmp(name, "/h3", HTML_BALISE_LEN) == 0){
return END_H3;
} else if (strncmp(name, "/h4", HTML_BALISE_LEN) == 0){
return END_H4;
} else if (strncmp(name, "/h5", HTML_BALISE_LEN) == 0){
return END_H5;
} else if (strncmp(name, "/h6", HTML_BALISE_LEN) == 0){
2024-01-03 17:41:19 +01:00
return H6;
} else if (strncmp(name, "table", HTML_BALISE_LEN) == 0){
return TABLE;
} else if (strncmp(name, "/table", HTML_BALISE_LEN) == 0){
return END_TABLE;
} else if (strncmp(name, "thead", HTML_BALISE_LEN) == 0){
return THEAD;
} else if (strncmp(name, "/thead", HTML_BALISE_LEN) == 0){
return END_THEAD;
} else if (strncmp(name, "tbody", HTML_BALISE_LEN) == 0){
return TBODY;
} else if (strncmp(name, "/tbody", HTML_BALISE_LEN) == 0){
return END_TBODY;
} else if (strncmp(name, "tr", HTML_BALISE_LEN) == 0){
return TR;
} else if (strncmp(name, "/tr", HTML_BALISE_LEN) == 0){
return END_TR;
} else if (strncmp(name, "th", HTML_BALISE_LEN) == 0){
return TH;
} else if (strncmp(name, "/th", HTML_BALISE_LEN) == 0){
return END_TH;
} else if (strncmp(name, "progress", HTML_BALISE_LEN) == 0){
return PROGRESS;
} else if (strncmp(name, "/progress", HTML_BALISE_LEN) == 0){
return END_PROGRESS;
}
return DONT_CARE;
}