lexer done?

2024-01-02 20:53:15 +01:00 · 2024-01-02 20:53:15 +01:00 · ab044c1126
parent 023c39c1f1
commit ab044c1126
4 changed files with 149 additions and 52 deletions
--- a/2
+++ b/2
@ -2,7 +2,7 @@ BIN=webpage
 SRCS=main.c config.c page.c lexer.c
 INC=.
 LIB=curl
-FLAGS=-Wall -Wextra -Og -g
+FLAGS=-Wall -Wextra -Og -g -ggdb -fvar-tracking
 all: $(BIN)
--- a/lexer.c
+++ b/lexer.c
@ -1,17 +1,40 @@
 #include "lexer.h"
 #define HTML_BALISE_LEN 12
 /* A read position inside the global `page`; a negative offset means the
  * cursor has not reached the first character yet (see the initial values
  * of `cursor` and `prev`). */
 typedef struct Cursor {
  int chunk;   /* index into page.chunks */
  int offset;  /* character index inside page.chunks[chunk] */
 } Cursor;
 Cursor cursor = {
  .chunk = 0,
-  .offset = 0,
+  .offset = -1,
 };
-char* nextchar(void){
+Cursor prev = {
-  if (page.chunks[cursor.chunk][cursor.offset+1] == '\0'){
+  .chunk = 0,
-    cursor.chunk++;
+  .offset = -2,
-    cursor.offset = 0;
+};
 /*
  * Advance `cur` by one character inside the global `page`.
  *
  * - An offset below -1 is only stepped toward -1 (this is how the trailing
  *   `prev` cursor, which starts at -2, stays one step behind `cursor`).
  * - When the character after the current one is the chunk's terminating
  *   '\0', the cursor moves to offset 0 of the next chunk.
  * - A cursor that is already past the last chunk (chunk >= page.len) is
  *   left untouched, so calling this again after the end of the page never
  *   indexes page.chunks out of bounds.
  */
 void increment_cursor(Cursor* cur){
  if (cur->offset+1 < 0){
    printf("init prev cursor.\n");
    cur->offset++;
    return;
  }
  /* Past the end: page.chunks[cur->chunk] does not exist, do not read it. */
  if (cur->chunk >= page.len){
    return;
  }
  if (page.chunks[cur->chunk][cur->offset+1] == '\0'){
    /* end of this chunk: continue at the start of the next one */
    cur->chunk++;
    cur->offset = 0;
  } else {
    cur->offset++;
  }
 }
 char* nextchar(void){
  increment_cursor(&cursor);
  increment_cursor(&prev);
  if (cursor.chunk >= page.len){
    return NULL;
@ -20,48 +43,111 @@ char* nextchar(void){
  return &page.chunks[cursor.chunk][cursor.offset];
 }
-char* HTMLbalise(void){
+void go_back(void){
  /* Un-read one character: the global read cursor takes the position
   * stored in `prev` (both chunk and offset). */
-  return NULL;
+  cursor.chunk = prev.chunk;
  cursor.offset = prev.offset;
  /* `prev` is moved back by one offset as well; its chunk is not changed. */
  prev.offset--;
  printf("got back mf.\n");
  /* A negative chunk or offset after rewinding is treated as fatal:
   * report the position and terminate the process. */
  if (cursor.chunk < 0 || cursor.offset < 0){
    printf("chunk = %d | offset= %d\n", cursor.chunk, cursor.offset);
    puts("ERROR: go way too back.");
    exit(1);
  }
 }
 /*
  * Translate an HTML tag name into its TokenType.
  *
  * name: tag name without the surrounding '<' '>' (closing tags keep their
  *       leading '/'); at most HTML_BALISE_LEN characters are compared.
  * Returns UNDEFINED_TYPE when name is NULL, the matching token type for a
  * known tag, and DONT_CARE for every other name.
  */
 TokenType token_by_name(const char name[HTML_BALISE_LEN]){
  /* Known tags, checked in order; the NULL entry terminates the table. */
  static const struct {
    const char* tag;
    TokenType type;
  } known_tags[] = {
    { "body",  BODY },
    { "/body", END_BODY },
    { "ul",    UL },
    { "li",    LI },
    { "h1",    H1 },
    { "h2",    H2 },
    { "h3",    H3 },
    { "h4",    H4 },
    { "h5",    H5 },
    { "h6",    H6 },
    { NULL,    DONT_CARE },
  };

  if (name == NULL){
    return UNDEFINED_TYPE;
  }

  for (int i = 0; known_tags[i].tag != NULL; i++){
    if (strncmp(name, known_tags[i].tag, HTML_BALISE_LEN) == 0){
      return known_tags[i].type;
    }
  }
  return DONT_CARE;
 }
 Token* nexttoken(void){
-  Token* token = malloc(sizeof(Token));
+  Token* token = NULL;
-  token->value = NULL;
+  static char* cursor = NULL;
  cursor = nextchar();
-  char* word = HTMLbalise();
+  while (*cursor == '\n'){
-  if (word == NULL){
+    printf("new line skipped.\n");
-    token->type = NO_TYPE;
+    cursor = nextchar();
-  } else if (strcmp(word, "body") == 0){
+  }
-    token->type = BODY;
+
-  } else if (strcmp(word, "/body") == 0){
+  if (*cursor != '<'){
-    token->type = END_BODY;
+    int i = 0;
-  } else if (strcmp(word, "ul") == 0){
+
-    token->type = UL;
+    do {
-  } else if (strcmp(word, "li") == 0){
+      cursor = nextchar();
-    token->type = LI;
+      i++;
-  } else if (strcmp(word, "h1") == 0){
+      printf("text: '%c'.\n", *cursor);
-    token->type = H1;
+    } while (*cursor != '<');
-  } else if (strcmp(word, "h2") == 0){
+    go_back();
-    token->type = H2;
+
-  } else if (strcmp(word, "h3") == 0){
+    token = malloc(sizeof(Token));
-    token->type = H3;
+    token->type = TEXT;
-  } else if (strcmp(word, "h4") == 0){
+    token->value = "TODO";
-    token->type = H4;
+    token->len = i;
-  } else if (strcmp(word, "h5") == 0){
+  } else if (*cursor == '<'){
-    token->type = H5;
+    char balise[HTML_BALISE_LEN] = {0};
-  } else if (strcmp(word, "h6") == 0){
+    int len = 0;
-    token->type = H6;
+
-  } else {
+    cursor = nextchar();
-    token->type = NO_TYPE;
+    while (*cursor != '>' && *cursor != ' ' && len < HTML_BALISE_LEN){
      balise[len] = *cursor;
      len++;
      printf("balise: '%c'.\n", *cursor);
      cursor = nextchar();
    }
    token = malloc(sizeof(Token));
    token->type = token_by_name(balise);
    token->value = malloc(sizeof(char) * len);
    strncpy(token->value, balise, len+1);
    token->len = len;
    do {
      cursor = nextchar();
      printf("skip: '%c'.\n", *cursor);
    } while (*cursor != '>');
  }
  return token;
 }
 void printtoken(Token* token){
  if (token == NULL){
    puts("NULL TOKEN");
    return;
  }
  switch (token->type) {
-    case NO_TYPE:
+    case UNDEFINED_TYPE:
-      printf("NO_TYPE: ");
+      printf("UNDEFINED_TYPE: ");
      break;
    case DONT_CARE:
      printf("DONT_CARE: ");
      break;
    case TEXT:
      printf("TEXT: ");
@ -97,14 +183,27 @@ void printtoken(Token* token){
      printf("H6: ");
      break;
    default:
-      printf("UNDEFINED TOKEN: ");
+      printf("ERROR: UNKNOWN TOKEN: ");
      break;
  }
  if (token->value == NULL){
-    puts("NO VALUE FOUND");
+    puts("'NO VALUE FOUND'");
    return;
  }
-  printf("%s\n", token->value);
+  printf("'%s'\n", token->value);
 }
 #if 0
 <!DOCTYPE html>
 <html lang="en">
  <head>
    <title>https://n3m0.fr/fr/</title>
    <link rel="canonical" href="https://n3m0.fr/fr/">
    <meta name="robots" content="noindex">
    <meta charset="utf-8">
    <meta http-equiv="refresh" content="0; url=https://n3m0.fr/fr/">
  </head>
 </html>
 #endif
--- a/lexer.h
+++ b/lexer.h
@ -8,7 +8,8 @@
 #include "page.h"
 typedef enum TokenType {
-  NO_TYPE,
+  UNDEFINED_TYPE,
  DONT_CARE,
  TEXT,
  BODY,
  END_BODY,
@ -19,13 +20,9 @@ typedef enum TokenType {
 /* One lexed unit, as returned by nexttoken(). */
 typedef struct Token {
  TokenType type;  /* kind of token */
  char* value;  /* string payload; printtoken() prints it with %s and reports NULL as missing */
  int len;  /* character count recorded by nexttoken() for this token */
 } Token;
 typedef struct Cursor {
  int chunk;
  int offset;
 } Cursor;
 Token* nexttoken(void);
 void printtoken(Token* token);
--- a/main.c
+++ b/main.c
@ -9,12 +9,13 @@ int main(int argc, char* argv[]){
  printPage();
-  Token* token;
+  Token* token = NULL;
-  do{
+  do {
    token = nexttoken();
    printtoken(token);
-    //parse(token);
+  } while (token != NULL && token->type != UNDEFINED_TYPE);
-  } while (token->type != NO_TYPE);
+
  //parse(token);
  return 0;
 }