Use Bst Below As Example feel free to use Whatever Feels Easiest
Ā
#include "bst.h"
/**
* This function assumes that the key is allocated on the heap already
*/
node* insert(node *temp_root, char *key, double value) {
if (temp_root == NULL) {
temp_root = malloc(sizeof(node));
temp_root->key = key;
temp_root->value = value;
temp_root->left = NULL;
temp_root->right = NULL;
} else if (strcmp(key, temp_root->key) < 0) {
temp_root->left = insert(temp_root->left, key, value);
} else if (strcmp(key, temp_root->key) > 0) {
temp_root->right = insert(temp_root->right, key, value);
} else {
printf("%s already exists in the tree. No action taken\n", key);
}
return temp_root;
}
/**
* This functions searches for a key in a tree
* Returns the node that has the key or NULL otherwise
*/
node* find(node *temp_root, char *key) {
node *result = NULL;
if (temp_root != NULL) {
if (strcmp(key, temp_root->key) == 0) {
result = temp_root; // I found it
} else if (strcmp(key, temp_root->key) < 0) {
result = find(temp_root->left, key);
} else {
result = find(temp_root->right, key);
}
}
return result;
}
void print_helper(node *tree, FILE *out) {
if (tree != NULL) {
print_helper(tree->left, out);
fprintf(out, "%s\t%f\n", tree->key, tree->value);
print_helper(tree->right, out);
}
}
/**
* Print contents of a tree in order
*/
void tprint(node *tree, char *file_name) {
FILE *out = fopen(file_name, "w");
print_helper(tree, out);
fclose(out);
}
/**
* Reclaim memory on the heap
*/
void clear(node *tree) {
if (tree != NULL) {
clear(tree->left);
clear(tree->right);
free(tree->key);
free(tree);
}
}
Ā
Use Gutenberg Below As Example feel free to use Whatever Feels Easiest
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include "bst.h"
const char *delim = "&";
// You may want to change the slash if you are working on Windows to '\'
const char slash = '/';
/**
Ā * Walk the bigram tree and calculate the conditional probability
Ā */
void print_prob_helper(node *uni_tree, node *bi_tree, FILE *out) {
Ā if (bi_tree != NULL) {
Ā // Walk the left sub-tree first
Ā print_prob_helper(uni_tree, bi_tree->left, out);
Ā // Process the current node
Ā char *bigram = bi_tree->key;
Ā double bi_count = bi_tree->value;
Ā // Find the delim
Ā int l1 = strlen(bigram);
Ā int index = -1;
Ā for (int i = 0; i < l1; i++) {
Ā Ā if (bigram[i] == delim[0]) {
Ā Ā index = i;
Ā Ā break;
Ā Ā }
Ā }
Ā if (index == -1) {
Ā Ā fprintf(stderr, "Something wrong with this bigram: %s.\n", bigram);
Ā Ā fprintf(stderr, "The program terminated abnormally.\n");
Ā Ā exit(1);
Ā } else {
Ā Ā // Extract the first word from the bigram
Ā Ā char w1[index + 1];
Ā Ā for (int j = 0; j < index; j++) {
Ā Ā w1[j] = bigram[j];
Ā Ā }
Ā Ā w1[index] = '\0';
Ā Ā // Extract the second word from the bigram
Ā Ā int l2 = l1 - index - 1;
Ā Ā char w2[l2 + 1];
Ā Ā for (int k = 0; k < l2; k++) {
Ā Ā w2[k] = bigram[k + index + 1];
Ā Ā }
Ā Ā w2[l2] = '\0';
Ā Ā // Find count of unigram
Ā Ā node *uni_node = find(uni_tree, w1);
Ā Ā if (uni_node != NULL) {
Ā Ā double uni_count = uni_node->value;
Ā Ā // P(WORD(k)|WORD(k-1)) = p
Ā Ā fprintf(out, "P(%s|%s) = %f\n", w2, w1, bi_count / uni_count);
Ā Ā } else {
Ā Ā fprintf(stderr,
Ā Ā Ā "Could not find a node for this unigram (extracted from %s): %s.\n",
Ā Ā Ā bigram, w1);
Ā Ā fprintf(stderr, "The program terminated abnormally.\n");
Ā Ā exit(1);
Ā Ā }
Ā }
Ā // Walk the right sub-tree
Ā print_prob_helper(uni_tree, bi_tree->right, out);
Ā }
}
/**
Ā * Print contents of a tree in order
Ā */
void print_prob(node *uni_tree, node *bi_tree, char *file_name) {
Ā // Pre-conditions
Ā if (uni_tree == NULL) {
Ā Ā
Ā fprintf(stderr, "Something wrong with the uni tree.\n");
Ā fprintf(stderr, "The program terminated abnormally.\n");
Ā exit(1);
Ā }
Ā if (bi_tree == NULL) {
Ā fprintf(stderr, "Something wrong with the bi tree.\n");
Ā fprintf(stderr, "The program terminated abnormally.\n");
Ā exit(1);
Ā }
Ā FILE *out = fopen(file_name, "w");
Ā print_prob_helper(uni_tree, bi_tree, out);
Ā fclose(out);
}
node* update_tree(node *tree, char *w) {
Ā node *n = find(tree, w);
Ā if (n == NULL) {
Ā tree = insert(tree, w, 1);
Ā } else {
Ā n->value = n->value + 1;
Ā }
Ā return tree;
}
node* update_uni_tree(node *tree, char *w) {
Ā char *mono;
Ā mono = strdup(w);
Ā return update_tree(tree, mono);
}
/**
Ā * Invalid bigram: ?v@??tremendous&force, w1: tremendous, w2: force
Ā */
node* update_bi_tree(node *tree, char *w1, char *w2) {
Ā char *bi;
Ā int l;
Ā l = strlen(w1) + 1 + strlen(w2) + 1;
Ā bi = (char*) malloc(l);
Ā strcpy(bi, w1);
Ā strcat(bi, delim);
Ā strcat(bi, w2);
Ā return update_tree(tree, bi);
}
void process_file(char *file_name) {
Ā // Make output file names
Ā int l = strlen(file_name);
Ā char uni_file_name[l + 5];
Ā char bi_file_name[l + 4];
Ā char cp_file_name[l + 4];
Ā strcpy(uni_file_name, file_name);
Ā strcat(uni_file_name, ".uni");
Ā strcpy(bi_file_name, file_name);
Ā strcat(bi_file_name, ".bi");
Ā strcpy(cp_file_name, file_name);
Ā strcat(cp_file_name, ".cp");
Ā printf("\nProcessing: %s\n", file_name);
Ā printf("Results will be written to:\n");
Ā printf("\t%s\n", uni_file_name);
Ā printf("\t%s\n", bi_file_name);
Ā printf("\t%s\n", cp_file_name);
Ā node *uni_tree = NULL;
Ā node *bi_tree = NULL;
Ā FILE *in = fopen(file_name, "r");
Ā char previous_word[100];
Ā char current_word[100];
Ā int index = 0;
Ā char c;
Ā bool can_update_bi = false;
Ā bool is_previous_good = false;
Ā while ((c = fgetc(in)) != EOF) {
Ā // A-Z
Ā if (c >= 65 && c <= 90) {
Ā Ā c = c + 32; // convert to lower case
Ā Ā current_word[index] = c;
Ā Ā index++;
Ā Ā is_previous_good = true;
Ā } else if (c >= 97 && c <= 122) {
Ā Ā current_word[index] = c;
Ā Ā index++;
Ā Ā is_previous_good = true;
Ā } else if (is_previous_good) { // not a letter
Ā Ā current_word[index] = '\0';
Ā Ā uni_tree = update_uni_tree(uni_tree, current_word);
Ā Ā if (can_update_bi) {
Ā Ā bi_tree = update_bi_tree(bi_tree, previous_word, current_word);
Ā Ā } else {
Ā Ā can_update_bi = true;
Ā Ā }
Ā Ā // Prepare for next word
Ā Ā strcpy(previous_word, current_word);
Ā Ā index = 0;
Ā Ā // the trees will not be updated if two non-alphabetical letters follow each other
Ā Ā is_previous_good = false;
Ā }
Ā }
Ā // The input file is no longer needed
Ā fclose(in);
Ā // Write results
Ā tprint(uni_tree, uni_file_name);
Ā tprint(bi_tree, bi_file_name);
Ā print_prob(uni_tree, bi_tree, cp_file_name);
Ā // Reclaim memory occupied by the trees
Ā clear(uni_tree);
Ā clear(bi_tree);
}
int main(int argc, char *argv[]) {
Ā if (argc == 1) {
Ā printf("Use: %s file_name_1.txt file_name_2.txt\n", argv[0]);
Ā } else {
Ā for (int i = 1; i < argc; i++) {
Ā Ā process_file(argv[i]);
Ā }
Ā }
Ā return 0;
}