I am trying to work with ACE source code (to do some experiments with the supertaggers I trained for the ERG), and I am not very experienced with C. As a result, I am finding myself quite confused, and I was hoping someone (@sweaglesw ?) might help.
I am very confused about how the ubertagging option gets triggered and used by the parser. Below are my two typical test cases: the first without the ubertagging option, using a grammar image compiled without access to the ubertagging model paths, and the second with the ubertagging option and with the appropriately compiled grammar:
./ace -g ~/delphin/erg/trunk/ace/erg.dat sentences.txt
./ace -g ~/delphin/erg/trunk/ace/english-ut.dat sentences.txt --ubertagging=0.0001
My goal is to be able to step through the program execution in both cases, so that I can understand the differences well and then after that add my own code for the new supertagger option.
Now, I am confused about what is going on. Below the code which is responsible for using the ubertagger:
/* Excerpt: the übertagging hook inside parse_with_token_chart().
 * NOTE(review): a `#include` in the middle of a function body is legal only
 * because the preprocessor pastes the header's declarations here, and
 * declarations are allowed at block scope; conventionally it belongs at the
 * top of the file. */
#include "ubertag.h"
extern struct ubertagger *the_ubertagger;  /* NULL unless the grammar image was built with an ubertagging model */
extern int enable_ubertagging;             /* set when --ubertagging is given on the command line */
char *st_file = "supertags.txt";
/* FIX: enable_ubertagging is an int, so it must be printed with %d.
 * The original "%f" is a format/argument type mismatch -- undefined
 * behavior, and the printed value is meaningless (CERT FIO47-C). */
printf("Ubertagging enabled: %d\n", enable_ubertagging);
if(g_profiling)start_and_alloc_profiler(&ubertagging_profiler, "übertagging", parse_profiler, lexical_filtering_profiler);
/* NOTE(review): with no braces, only the single ubertag_lattice() call below
 * is guarded by the if; the two print statements after it always execute,
 * regardless of the_ubertagger / enable_ubertagging. */
if(the_ubertagger && enable_ubertagging)
	ubertag_lattice(the_ubertagger, lexical_chart, log(ubertagging_threshold));
printf("Lexical chart after ubertagging:\n");
print_lexical_chart(lexical_chart);
The above code is part of a function called parse_with_token_chart; I paste the whole function below. Note the unusual (?) indentation of the code that includes the ubertagger header file: it is in the middle of the function, but it is indented as if it were external to the function scope (maybe because the extern keyword is used? I am not familiar with this…).
/*
 * Run the full parsing pipeline on one sentence whose token lattice has
 * already been built.
 *
 * Pipeline: token mapping -> lexical lookup -> lexical parsing ->
 * lexical filtering -> (optional) übertagging -> chart parsing ->
 * result output (forest dump, stdout, [incr tsdb()], or LUI mode).
 *
 * token_chart: the sentence's token lattice (modified in place by the
 *              mapping/reduction stages).
 * start:       clock() timestamp from when processing of this sentence
 *              began; used for the forest time reported to [incr tsdb()].
 *
 * Returns the number of parses found, or a negative code on failure:
 *   -1 = resource failure (no lexical chart / out of RAM),
 *   -2 = sentence rejected (too long, lexical gap, nothing to parse, ...).
 */
int parse_with_token_chart(struct lattice *token_chart, clock_t start)
{
	// unused locals removed (i, count, num_entries, root, sem were never read)
	int cforest;
	int num_parses = 0;
	float t_setup, t_forest, t_unpack;
	struct edge *edge;

	if(do_itsdb)itsdb_dump_tokens(":p-input", ":ninputs", token_chart);

	// token mapping: rewrite the raw token lattice into grammar-internal tokens
	apply_token_mapping(token_chart);
	sort_lattice(token_chart);
	if(!yy_mode && do_improve_characterization)
		improve_token_lattice_characterization(token_chart);

	// give up early on overly long inputs
	if(token_chart->nvertices>give_up_threshold)
	{
		fprintf(stderr, "NOTE: giving up, too many words (%d+)\n", give_up_threshold);
		itsdb_error("too many words");
		if(do_itsdb)itsdb_report_parse(token_chart, NULL, -2, 0);
		stop_timer(chart_setup_timer, 1);
		return -2;
	}
	if(trace)print_token_chart(token_chart);
	if(trace)printf("finished token mapping\n");

	if(g_profiling)start_and_alloc_profiler(&lexical_lookup_profiler, "lexical lookup", parse_profiler, token_mapping_profiler);
	// do lexical lookup
	struct lattice *lexical_chart = lexical_lookup_into_chart(token_chart);
	if(!lexical_chart)
	{
		fprintf(stderr, "NOTE: failed to build lexical chart\n");
		stop_timer(chart_setup_timer, 1);
		return -1;
	}
	if(trace)printf("finished lexical lookup\n");

	if(g_profiling)start_and_alloc_profiler(&lexical_parsing_profiler, "lexical parsing", parse_profiler, lexical_lookup_profiler);
	extern int chart_size;
	// set to a predictable value rather than leaving it whatever the last
	// sentence had; used in deciding whether an edge is spanning or not, as
	// early as lexical parsing, for deciding whether the rule_root_utc[] is
	// applicable.
	chart_size = 0;
	extern int reduce_chart_before_lexical_parsing;
	if(!reduce_chart_before_lexical_parsing)
	{
		// do lexical parsing
		int rv = lexical_parse_lattice(lexical_chart);
		if(rv == -1)
		{
			itsdb_error("ran out of RAM in lexical parsing");
			return -1; // ran out of memory in lexical parsing!
		}
	}
	if(reduce_lexical_lattice(lexical_chart, token_chart))
	{
		fprintf(stderr, "NOTE: post reduction gap\n");
		stop_timer(chart_setup_timer, 1);
		if(do_itsdb)itsdb_report_parse(token_chart, lexical_chart, -2, 0);
		return -2;
	}
	if(reduce_chart_before_lexical_parsing)
		lexical_parse_lattice(lexical_chart);
	if(trace)print_lexical_chart(lexical_chart);
	if(trace)printf("finished lexical parsing\n");

	if(g_profiling)start_and_alloc_profiler(&lexical_filtering_profiler, "lexical filtering", parse_profiler, lexical_parsing_profiler);
	// do lexical filtering
	apply_lexical_filtering(lexical_chart);
	if(trace)printf("finished lexical filtering\n");

// NOTE(review): a mid-function #include is legal only because the
// preprocessor pastes the header's declarations here, and declarations are
// allowed at block scope; it conventionally belongs at the top of the file.
#include "ubertag.h"
	extern struct ubertagger *the_ubertagger;	// non-NULL only when the grammar image was built with an ubertagging model
	extern int enable_ubertagging;			// set when --ubertagging is given on the command line
	char *st_file = "supertags.txt";		// NOTE(review): points at a string literal; treat as read-only (consider const char *)
	if(g_profiling)start_and_alloc_profiler(&ubertagging_profiler, "übertagging", parse_profiler, lexical_filtering_profiler);
	// FIX: braces added. Previously only the ubertag_lattice() call was
	// guarded and the two debug prints below ran unconditionally; they are
	// now behind the usual `trace' flag, consistent with the rest of this
	// function's diagnostics.
	if(the_ubertagger && enable_ubertagging)
	{
		ubertag_lattice(the_ubertagger, lexical_chart, log(ubertagging_threshold));
	}
	if(trace)
	{
		printf("Lexical chart after ubertagging:\n");
		print_lexical_chart(lexical_chart);
	}

	if(g_profiling)start_and_alloc_profiler(&chart_parsing_profiler, "chart parsing", parse_profiler, ubertagging_profiler);
	// XXX far-fetched experiment...
	//first_pass_parse(lexical_chart);
	//return 1;

	// lexemes have no packing restrictor applied to them thus far.
	// make sure all copy()'s from here on out (until unpacking time) are restricting.
	if(!inhibit_pack)enable_packing(1);
	//print_lexical_chart(lexical_chart);
	struct supertagger *the_supertagger = load_supertagger(st_file); //OZ: This is new code; does nothing at the moment.

	// setup the main parse chart
	int nwords = prepare_parse_chart(token_chart, lexical_chart);
	if(nwords < 0)
	{
		fprintf(stderr, "NOTE: negative lexical chart length\n");
		itsdb_error("negative lexical chart length");
		if(do_itsdb)itsdb_report_parse(token_chart, lexical_chart, -2, 0);
		stop_timer(chart_setup_timer, 1);
		enable_packing(0);
		return -2;
	}
	assert(nwords >= 0);
	if(!nwords)
	{
		fprintf(stderr, "NOTE: nothing to parse\n");
		itsdb_error("nothing to parse");
		if(do_itsdb)itsdb_report_parse(token_chart, lexical_chart, -2, 0);
		stop_timer(chart_setup_timer, 1);
		enable_packing(0);
		return -2;
	}

	// XXX experimental chart-pruning-like idea
	predict_rule_uses(nwords, lexical_chart);

	// arm the per-sentence timeout
	cancel_task = 0;
	did_timeout = 0;
	signal(SIGALRM, alarm_handler);
	if(timeout_seconds > 0)
		alarm(timeout_seconds);
	stop_timer(chart_setup_timer, 1);
	static int chart_parsing_timer = -1;
	if(chart_parsing_timer == -1)chart_parsing_timer = new_timer("chart parsing");
	start_timer(chart_parsing_timer);
	t_setup = timer();

	// do the actual parsing
	int half = nwords/2;
	// FIX: t_half was read below (t_forest = t_half + timer()) even when the
	// agenda loop never produced an edge spanning half the chart, i.e. a read
	// of an indeterminate value; initialize it to 0.
	double t_half = 0;
	while( !cancel_task && (edge = next_agenda(0)) )
	{
		if(edge->to-edge->from >= half)
		{
			half = nwords+2;	// record the halfway timestamp only once
			t_half = timer();
			//halftime_analysis();
		}
		if(parse_process_edge(edge))break;
	}
	compact_generalization_edges();
	enable_packing(0);
	stop_timer(chart_parsing_timer, 1);
	if(g_profiling)start_and_alloc_profiler(&unpacking_profiler, "unpacking", parse_profiler, chart_parsing_profiler);
	// NOTE(review): summing two timer() readings looks suspicious (a
	// difference would normally be expected) -- confirm against timer()'s
	// semantics; kept as in the original.
	t_forest = t_half + timer();
	cforest = clock() - start;

	if(do_forest)
	{
		// forest-dump mode: print the packed forest instead of unpacking
		char *sentence = current_sentence;
		fflush(stdout);
		if(sentence)fprintf(stderr, "SENT: %s\n", sentence);
		int found_root = output_forest(&cells[chart_size-1], token_chart);
		fflush(stdout);
		fprintf(stderr, "NOTE: %d readings [forest], ", found_root);
		print_slab_stats();
		fprintf(stderr, "\n");
		num_parses = 1;
	}
	else if(!do_itsdb && !lui_mode)
	{
		// normal command-line mode: unpack and show the results
		didsent = 0;
		if(chart_size>0)
			num_parses = iterate_cell_root_hypotheses(&cells[chart_size-1], parse_show_result, best_only);
		else num_parses = 0;
		fprintf(stderr, "NOTE: %d readings, added %d / %d edges to chart (%d fully instantiated, %d actives used, %d passives used)\t",
			num_parses, total_edges, real_edges, passive_edges, used_edges, pused_edges);
		print_slab_stats();
		//rule_profiler(); // XXX comment me out normally
	}
	else if(do_itsdb)
	{
		// [incr tsdb()] protocol mode
		num_parses = itsdb_report_parse(token_chart, lexical_chart, 0, (int)((long long)cforest*1000 / CLOCKS_PER_SEC));
	}
	else if(lui_mode)
	{
		// LUI (graphical client) mode
		num_parses = output_lui_trees(cells+chart_size-1, current_sentence, best_only, 0);
		if(!num_parses){}//output_lui_chart(words);
		last_token_chart = token_chart;
	}
	alarm(0);	// cancel any pending timeout
	// extern int output_edge_vectors;
	// if(output_edge_vectors)
	// 	do_output_edge_vectors();
	t_unpack = timer();
	used_total += used_edges;
	pused_total += pused_edges;
	if(0)	// disabled per-sentence timing dump
	{
		static FILE *f = 0;
		//extern long long ndecomposed;
		//static long long ond = 0;
		if(!f)f = fopen("/tmp/times", "w");
		fprintf(f, "%d %d %f %f %f %s\n", nwords, num_parses, t_setup, t_forest, t_unpack, current_sentence);
		//ond = ndecomposed;
		fflush(f);
	}
	return num_parses;
}
What is confusing to me is that, no matter whether I use the first or the second command, the program still executes at least some (??) of the statements that follow the if(the_ubertagger && enable_ubertagging)
check. In the screenshot below you can observe the execution being stopped at the breakpoint on line 532 (after somehow not stopping at line 531), and you can observe that the value of the variable enable_ubertagging
is definitely 0 (false). The the_ubertagger
pointer is 0x0 also.
Can anyone help me understand what is going on? How is this possible? I am using VS Code, but I observe the same output if I run the program from the terminal. Is there something special about these extern
variables that makes this possible? (Or is this an indication that I am not looking at what I think I am looking at, or something like that?)