diff --git a/Makefile b/Makefile index 7bcedbf8856f73fe0bb78af894cf6953dd98c0ed..305bd7a6c12dad17efffb1984de271cbe0c559ff 100644 --- a/Makefile +++ b/Makefile @@ -1,2 +1,2 @@ all: - gcc -Wall -pedantic -g main.c class.c nfa.c dfa.c parse.c -o main \ No newline at end of file + gcc -Wall -pedantic -g main.c class.c nfa.c dfa.c parse.c regex.c -o main \ No newline at end of file diff --git a/dfa.c b/dfa.c index bfe8ace97b18863f3e3de0882374b807dbeb8ca4..c8fa4954f3be4b3eaab9f5c303dc88200b7e0209 100644 --- a/dfa.c +++ b/dfa.c @@ -3,7 +3,13 @@ #include <stdlib.h> #include <string.h> + +#ifdef DEBUG #include <stdio.h> +#define DEBUG_PRINT(...) printf(__VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif int int_cmp(const void *a, const void *b) { return *(int*)a - *(int*)b; @@ -18,6 +24,8 @@ typedef struct { } dfa_temp_node; dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) { + DEBUG_PRINT("=====DFA CONSTRUCTION=====\n"); + int nfa_begin = nfa_count - 2; int nfa_end = nfa_count - 1; @@ -46,9 +54,10 @@ dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) { while (p > 0) { int current_idx = dfa_stack[--p]; dfa_temp_node *current = &dfa_nodes[current_idx]; - printf("processing dfa[%d]; set={", current_idx); - for (int i = 0; i < current->n_set; i++) printf("%d,", current->set[i]); - printf("}\n"); + DEBUG_PRINT("processing dfa[%d]; set={", current_idx); + for (int i = 0; i < current->n_set; i++) + DEBUG_PRINT("%d,", current->set[i]); + DEBUG_PRINT("}\n"); int **edges = (int**)malloc(sizeof(int*)*0xFF); int *edge_set_lengths = (int*)malloc(sizeof(int)*0xFF); @@ -85,14 +94,14 @@ dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) { symbol_idx = edge_count++; } //calculate reachable nfa node set for this symbol - printf("\ttransition set from nfa[%d] for symbol '%c': {", nfa_node_idx, symbol); + DEBUG_PRINT("\ttransition set from nfa[%d] for symbol '%c': {", nfa_node_idx, symbol); for (int f = 0; f < nfa_count; f++) { if (class_has(class_set[f], symbol)) { edges[symbol_idx][edge_set_lengths[symbol_idx]++] = f; - printf("%d,", f); + DEBUG_PRINT("%d,", f); } } - printf("}\n"); + DEBUG_PRINT("}\n"); } } int *transitions = (int*)malloc(sizeof(int)*edge_count); @@ -100,16 +109,22 @@ dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) { int reduced_edge_count = 0; for (int i = 0; i < edge_count; i++) { qsort(edges[i], edge_set_lengths[i], sizeof(int), int_cmp); + DEBUG_PRINT("edge[%d]: %c -> {", i, edge_symbols[i]); + for (int k = 0; k < edge_set_lengths[i]; k++) { + DEBUG_PRINT("%d,", edges[i][k]); + } + DEBUG_PRINT("}, "); int target = -1; for (int k = 0; k < dfa_count; k++) { if (dfa_nodes[k].n_set != edge_set_lengths[i]) continue; - if (memcmp(dfa_nodes[k].set, edges[i], edge_set_lengths[i]) == 0) { //match + if (memcmp(dfa_nodes[k].set, edges[i], edge_set_lengths[i]*sizeof(int)) == 0) { //match + DEBUG_PRINT("reusing dfa_node dfa[%d]\n", k); target = k; break; } } if (target == -1) { - printf("\t\tpushing new dfa_node dfa[%d]\n", dfa_count); + DEBUG_PRINT("pushing new dfa_node dfa[%d]\n", dfa_count); dfa_nodes[dfa_count] = (dfa_temp_node){ edge_set_lengths[i], -1, edges[i], @@ -170,13 +185,10 @@ dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) { return dfa_obj; } -dfa dfa_from_ast(ast_node *ast, int ast_count, int ast_root) { - int nfa_count = 2*ast_count; - int idx = 0; - nfa_node *nfa = (nfa_node*)malloc(sizeof(nfa_node)*nfa_count); - ast_to_nfa(nfa, ast, &idx, ast_root); - return nfa_to_dfa(nfa, nfa_count); -} -void dfa_free(dfa d) { - //... +void dfa_destroy(dfa d) { + for (int i = 0; i < d.n_nodes; i++) { + free(d.nodes[i].to); + free(d.nodes[i].what); + } + free(d.nodes); } \ No newline at end of file diff --git a/dfa.h b/dfa.h index 0ec5126683ad7ee56fa59640ed6eecff87e47a48..9759aa4d3817637d4f9db53cccc1119429822d4c 100644 --- a/dfa.h +++ b/dfa.h @@ -1,5 +1,5 @@ -#include "parse.h" +#include "nfa.h" #include <stdbool.h> typedef struct { @@ -15,5 +15,5 @@ typedef struct { int begin; } dfa; -dfa dfa_from_ast(ast_node *ast, int ast_count, int ast_root); -void dfa_free(dfa d); \ No newline at end of file +dfa nfa_to_dfa(nfa_node *nfa, int nfa_count); +void dfa_destroy(dfa d); \ No newline at end of file diff --git a/main.c b/main.c index 0eaf1e697cb83e94c44cdbd9687595f0dbc41088..61d3f6e5fd5258f4f8d2c5d7abcbc31c880ed750 100644 --- a/main.c +++ b/main.c @@ -1,40 +1,38 @@ #include "dfa.h" #include "parse.h" +#include "regex.h" #include <stdio.h> #include <stdlib.h> int main(int argc, char **argv) { - if (argc < 2) { + if (argc < 3) { printf("No regex argument found!\n"); return 1; } - char *r = argv[1]; - int ast_root, ast_count; - ast_node *parsed_ast = ast_from_str(r, &ast_count, &ast_root); - printf("Parsed regex '%s' [root=%d]:\n", r, ast_root); - for (int i = 0; i < ast_count; i++) { - ast_node a = parsed_ast[i]; - int out_len; - char* str = class_str(a.class, &out_len); - printf("\tast[%d] = {%c, %s, %d, %d}\n", i, a.op, str, a.a, a.b); - free(str); - } - - dfa d = dfa_from_ast(parsed_ast, ast_count, ast_root); - free(parsed_ast); + char *pattern = argv[1]; + char *text = argv[2]; + regex *r = regex_compile(pattern); - printf("--------\n"); - for (int i = 0; i < d.n_nodes; i++) { - dfa_node node = d.nodes[i]; - printf("dfa%c%c[%d] transitions: { ", (i == d.begin ? '>' : ' '), (node.end ? '!' : ' '), i); - for (int k = 0; k < node.edges; k++) { - int out_len; - char* str = class_str(node.what[k], &out_len); - printf("\"%s\":%d, ", str, node.to[k]); - free(str); + //printf("-------------\n"); + int match; + while(*text) { + match = regex_match(r, text); + if (match != -1) { + printf("{l=%d}[", match); + break; } - printf("}\n"); + printf("%c", *text); + text++; } + while (*text) { + if (match-- == 0) printf("]"); + printf("%c", *text); + text++; + } + if (match-- == 0) printf("]"); + printf("\n"); + + regex_destroy(r); } \ No newline at end of file diff --git a/nfa.h b/nfa.h index 33b1f830603a7125c92bc36d3979a5f3a7ad255c..825cb771959ce05c7890ff53c2c47455b1132368 100644 --- a/nfa.h +++ b/nfa.h @@ -1,3 +1,5 @@ +#ifndef _NFA_H_ +#define _NFA_H_ #include "class.h" #include "parse.h" @@ -8,4 +10,6 @@ typedef struct { } nfa_node; int ast_to_nfa(nfa_node *nfa, ast_node *ast, int *idx, int ast_idx); -void nfa_set(nfa_node *nfa, int idx, int *stack, int *vis, class *set); \ No newline at end of file +void nfa_set(nfa_node *nfa, int idx, int *stack, int *vis, class *set); + +#endif \ No newline at end of file diff --git a/regex.c b/regex.c index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..b36217c024006a522a84e5324face894e2770fea 100644 --- a/regex.c +++ b/regex.c @@ -0,0 +1,107 @@ +#include "dfa.h" +#include "nfa.h" +#include <stdlib.h> +#include <stdio.h> + +typedef struct { + dfa dfa; +} regex; + +void dump_dfa(dfa d) { + printf("=====DFA=====\n"); + for (int i = 0; i < d.n_nodes; i++) { + dfa_node node = d.nodes[i]; + printf("dfa%c%c[%d] transitions: { ", (i == d.begin ? '>' : ' '), (node.end ? '!' : ' '), i); + for (int k = 0; k < node.edges; k++) { + int out_len; + char* str = class_str(node.what[k], &out_len); + printf("\"%s\":%d, ", str, node.to[k]); + free(str); + } + printf("}\n"); + } +} + +void dump_ast(ast_node *ast, int ast_count) { + printf("=====AST=====\n"); + for (int i = 0; i < ast_count; i++) { + ast_node n = ast[i]; + if (n.op != '\0') + printf("ast[%d]={ %c %d %d }\n", i, n.op, n.a, n.b); + else { + int out_len; + char* str = class_str(n.class, &out_len); + printf("ast[%d]=[%s]\n", i, str); + free(str); + } + } +} + +void dump_nfa(nfa_node *nfa, int nfa_count) { + printf("=====NFA=====\n"); + for (int i = 0; i < nfa_count; i++) { + nfa_node n = nfa[i]; + if (class_any(n.class)) { + int out_len; + char* str = class_str(n.class, &out_len); + printf("nfa[%d]={ [%s] -> %d }\n", i, str, n.a); + free(str); + } else if (n.b == -1) { + printf("nfa[%d]={ %d }\n", i, n.a); + } else { + printf("nfa[%d]={ %d %d }\n", i, n.a, n.b); + } + } +} + +regex *regex_compile(char* pattern) { + //ast + int ast_root, ast_count; + ast_node *parsed_ast = ast_from_str(pattern, &ast_count, &ast_root); + //dump_ast(parsed_ast, ast_count); + + //nfa + int nfa_count = 2*ast_count; + int idx = 0; + nfa_node *nfa = (nfa_node*)malloc(sizeof(nfa_node)*nfa_count); + ast_to_nfa(nfa, parsed_ast, &idx, ast_root); + free(parsed_ast); + //dump_nfa(nfa, nfa_count); + + //dfa + dfa d = nfa_to_dfa(nfa, nfa_count); + free(nfa); + //dump_dfa(d); + + //regex + regex *r = (regex*)malloc(sizeof(regex)); + r->dfa = d; + return r; +} + +void regex_destroy(regex *pattern) { + dfa_destroy(pattern->dfa); + free(pattern); +} + +int regex_match(regex *pattern, char *str){ + dfa d = pattern->dfa; + int idx = 0; + int state = d.begin; + if (d.nodes[state].end) return 0; + char c; + while ((c = str[idx++])) { + dfa_node *n = &d.nodes[state]; + bool s = false; + for (int i = 0; i < n->edges; i++) { + if (class_has(n->what[i], c)) { + state = n->to[i]; + if (d.nodes[state].end) return idx; + s = true; + break; + } + } + if (!s) return -1; + } + return -1; +} \ No newline at end of file diff --git a/regex.h b/regex.h index 6aace47d7bec6d2568136cf53fcfa754f384374e..3a8854f06a1a8321f8f96823bfeb2913dfc66499 100644 --- a/regex.h +++ b/regex.h @@ -1,8 +1,6 @@ - -typedef struct { - -} regex; +struct regex; +typedef struct regex regex; regex *regex_compile(char* pattern); -void regex_destroy(regex *pattern) -bool regex_match(regex *pattern, char *str); +void regex_destroy(regex *pattern); +int regex_match(regex *pattern, char *str); diff --git a/spec.tex b/spec.tex index 1f0090e591717ff7fe17082d8d602b719e5a772a..fa80957291bfd5e82d2a48800d0e37753ef6985e 100644 --- a/spec.tex +++ b/spec.tex @@ -2,7 +2,7 @@ \documentclass{article} \usepackage{amsfonts} -\title{Reguláris kifejezések} +\title{Reguláris kifejezések specifikáció} \author{Kurucz György} \begin{document}