/* SMOLNET PORTAL home about changes  (portal navigation text captured with this file; not part of the program) */
/* index qi data file */
/* Bruce Tanner - Cerritos College */

/* 1.0  1993/08/14 Start with build_index */
/* 1.1  1993/08/30 Make fopen failure more explicit */
/* 1.2  1993/09/04 Move soundex creation outside */

#include ssdef
#include stdio
#include string
#include ctype
#include rms
#include descrip
#include climsgdef
#include assert
#include "qi.h"


char idx_record[IDX_RECORD_SIZE + 1];
char idx_key[IDX_KEY_SIZE + 1];
char dat_record[DAT_RECORD_SIZE + 1];
char dat_key[DAT_KEY_SIZE + 1];
int  field_attrib[MAX_FIELD];
int mode = 0;

#define CREATE 1
#define MERGE 2

void read_fields(char *);
void index_words(char *, struct RAB *, struct RAB *);
struct dsc$descriptor_s *descr(char *);
void build_commands();
int lib$get_foreign(), lib$get_input();

main(int argc, char *argv[])
{

    FILE  *src;
    char  cli_input[256], file_arg[256], file_spec[256];
    char  idx_name[256], dat_name[256];
    char  *ptr, field[DATA_SIZE + 1];
    char  dat_copy[DAT_RECORD_SIZE + 1];
    int   status, context = 0, count = 0;
    short leng;
    struct FAB idxfab, datfab;
    struct RAB idxrab, datrab;
    struct XABKEY idxxab, datxab;
    $DESCRIPTOR(input_dsc, cli_input);
    $DESCRIPTOR(file_dsc, file_arg);
    $DESCRIPTOR(file_spec_dsc, file_spec);
    $DESCRIPTOR(idx_dsc, idx_name);

    status = lib$get_foreign(&input_dsc, 0, &leng, 0);

    strncpy(cli_input+6, cli_input, leng);
    strncpy(cli_input, "build ", 6);

    status = cli$dcl_parse(&input_dsc, build_commands, lib$get_input);

    if (status != CLI$_NORMAL)  /* error in parse, exit */
        exit(1);

    if ((cli$present(descr("file")) & 1) == 0) {
        printf("Usage: build data_file /data/create/merge/config=.../output=...\n");
        exit(3);
    }

    status = cli$get_value(descr("file"), &file_dsc, &leng);  /* get source */

    status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0);
    ptr = strchr(file_spec, ' ');
    if (ptr) *ptr = '\0';            /* chop off trailing spaces */
    strcpy(idx_name, file_spec);    /* make copy for output spec */

    if (cli$present(descr("output")) & 1) { /* if /output, overwrite out_name */
        status = cli$get_value(descr("output"), &idx_dsc, &leng);
        idx_name[leng] = '\0';
    }

    if (cli$present(descr("create")) & 1)
        mode = CREATE;
    if (cli$present(descr("merge")) & 1)
        mode = MERGE;

    ptr = strrchr(idx_name, '.');  /* just get file name */
    if (ptr) *ptr = '\0';
    strcat(idx_name, ".INDEX");

    idxfab = cc$rms_fab;
    idxfab.fab$b_bks = 6;
    idxfab.fab$b_fac = FAB$M_GET | FAB$M_PUT;
    idxfab.fab$l_fna = idx_name;
    idxfab.fab$b_fns = strlen(idx_name);
    idxfab.fab$l_fop = FAB$M_CBT | FAB$M_DFW;
    idxfab.fab$w_mrs = IDX_RECORD_SIZE;
    idxfab.fab$b_org = FAB$C_IDX;
    idxfab.fab$b_rat = FAB$M_CR;
    idxfab.fab$b_rfm = FAB$C_FIX;
    idxfab.fab$b_shr = FAB$M_NIL;
    idxfab.fab$l_xab = &idxxab;

    idxrab = cc$rms_rab;
    idxrab.rab$l_fab = &idxfab;
    idxrab.rab$b_krf = 0;
    idxrab.rab$l_kbf = idx_key;
    idxrab.rab$b_ksz = IDX_KEY_SIZE;
    idxrab.rab$b_rac = RAB$C_KEY;
    idxrab.rab$l_rbf = idx_record;
    idxrab.rab$w_rsz = IDX_RECORD_SIZE;
    idxrab.rab$l_ubf = idx_record;
    idxrab.rab$w_usz = IDX_RECORD_SIZE;
    idxrab.rab$b_mbf = 20;
    idxrab.rab$l_rop = RAB$M_RAH | RAB$M_WBH;

    idxxab = cc$rms_xabkey;
    idxxab.xab$b_dtp = XAB$C_STG;
    idxxab.xab$b_flg = XAB$M_IDX_NCMPR;
    idxxab.xab$w_pos0 = 0;
    idxxab.xab$b_siz0 = IDX_KEY_SIZE;
    idxxab.xab$b_ref = 0;

    strcpy(dat_name, idx_name);
    ptr = strrchr(dat_name, '.');  /* just get file name */
    if (ptr) *ptr = '\0';
    strcat(dat_name, ".DATA");

    datfab = cc$rms_fab;
    datfab.fab$b_bks = 9;
    datfab.fab$b_fac = FAB$M_GET | FAB$M_PUT | FAB$M_UPD;
    datfab.fab$l_fna = dat_name;
    datfab.fab$b_fns = strlen(dat_name);
    datfab.fab$l_fop = FAB$M_CBT | FAB$M_DFW;
    datfab.fab$w_mrs = DAT_RECORD_SIZE;
    datfab.fab$b_org = FAB$C_IDX;
    datfab.fab$b_rat = FAB$M_CR;
    datfab.fab$b_rfm = FAB$C_VAR;
    datfab.fab$b_shr = FAB$M_NIL;
    datfab.fab$l_xab = &datxab;

    datrab = cc$rms_rab;
    datrab.rab$l_fab = &datfab;
    datrab.rab$b_krf = 0;
    datrab.rab$l_kbf = dat_key;
    datrab.rab$b_ksz = DAT_KEY_SIZE;
    datrab.rab$b_rac = RAB$C_KEY;
    datrab.rab$l_rbf = dat_record;
    datrab.rab$b_mbf = 20;
    datrab.rab$l_rop = RAB$M_RAH | RAB$M_WBH;

    datxab = cc$rms_xabkey;
    datxab.xab$b_dtp = XAB$C_STG;
    datxab.xab$b_flg = XAB$M_DAT_NCMPR | XAB$M_IDX_NCMPR;
    datxab.xab$w_pos0 = 0;
    datxab.xab$b_siz0 = DAT_KEY_SIZE;
    datxab.xab$b_ref = 0;


    /* open index file */
    if (mode == CREATE)
        if (((status = sys$create(&idxfab)) & 1) != SS$_NORMAL)
            lib$stop(status);
    if (mode == MERGE)
        if (((status = sys$open(&idxfab)) & 1) != SS$_NORMAL)
            lib$stop(status);
    if (((status = sys$connect(&idxrab)) & 1) != SS$_NORMAL)
        lib$stop(status);

    /* open data file */
    if (cli$present(descr("DATA")) & 1) {
        if (mode == CREATE)
            if (((status = sys$create(&datfab)) & 1) != SS$_NORMAL)
                lib$stop(status);
        if (mode == MERGE)
            if (((status = sys$open(&datfab)) & 1) != SS$_NORMAL)
                lib$stop(status);
        if (((status = sys$connect(&datrab)) & 1) != SS$_NORMAL)
            lib$stop(status);
    }

    /* record the fields with Indexed attribute */
    read_fields(file_spec);

    for (;;) {  /* process all files in input spec, first one already found */

        if ((src = fopen(file_spec, "r", "mbc=50", "mbf=20")) == NULL) { 
            printf("Can't read input file %s\n", file_spec);
            exit(5);
        }
        printf("Building index for %s\n", file_spec);

        while (fgets(dat_record, sizeof(dat_record), src)) {
            if ((ptr = strchr(dat_record, '\r')) ||
                (ptr = strchr(dat_record, '\n')))
                *ptr = '\0';              /* remove newline */

            if (strlen(dat_record) == 0)
                continue;                 /* skip blank lines */

            if ((++count % 500) == 0)
                printf("%d\n", count);

            /* if /DATA requested, write .data file record */
            if (cli$present(descr("data")) & 1) {
                strncpy(dat_key, dat_record, DAT_KEY_SIZE);
                datrab.rab$w_rsz = strlen(dat_record);
                if ((status = sys$put(&datrab)) != RMS$_NORMAL) {
                    if ((status == RMS$_DUP) && (mode == MERGE)) {
                        status = sys$find(&datrab);
                        status = sys$update(&datrab);    /* update the record */
                    }
                    if (status != RMS$_NORMAL) {
                        printf("DATA key (%d chars) %s\n", strlen(dat_key), dat_key);
                        printf("DATA rec (%d chars) %s\n", strlen(dat_record), dat_record);
                        lib$stop(status);
                    }
                }
            }
            strcpy(dat_copy, dat_record);

            /* if this is an indexed field, write index record(s) */
            strncpy(field, dat_copy + ID_SIZE, FIELD_SIZE);
            field[FIELD_SIZE] = '\0';
            if (field_attrib[atoi(field)] & ATTR_INDEXED) {
                for (ptr = dat_copy; *ptr; ptr++)
                    if (iscntrl(*ptr))  *ptr = ' ';  /* convert tabs to spaces */
                while ((strlen(dat_copy) > 0) &&
                       (dat_copy[strlen(dat_copy)-1] == ' '))
                    dat_copy[strlen(dat_copy)-1] = '\0';/* remove trailing blanks */
                for (ptr = dat_copy; *ptr; ptr++)
                    *ptr = _tolower(*ptr);           /* force lowercase */
        
                index_words(dat_copy, &idxrab, &datrab);
            }
        }

        fclose(src);
        status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0);
        if ((status & 1) == 0) {
            lib$find_file_end(&context);
            break;
        }
        ptr = strchr(file_spec, ' ');
        if (ptr) *ptr = '\0';            /* chop off trailing spaces */
    }
    if (cli$present(descr("data")) & 1)
        sys$close(&datfab);
    sys$close(&idxfab);
}


/* break data field into words and write them to index file */

void index_words(char *line, struct RAB *idxptr, struct RAB *datptr)
{
    char data[DATA_SIZE + 2], field[FIELD_SIZE + 1], id[ID_SIZE + 1];
    char *cp, *cp2;
    int  status;

    strncpy(id, line, ID_SIZE);
    id[ID_SIZE] = '\0';
    strncpy(field, line + ID_SIZE, FIELD_SIZE);
    field[FIELD_SIZE] = '\0';
    strncpy(data, line + ID_SIZE + FIELD_SIZE + SEQ_SIZE + ATTR_SIZE, DATA_SIZE);
    data[DATA_SIZE] = '\0';

    /* special hack to omit indexing the email domain */
    if ((strcmp(field, EMAIL_FIELD) == 0) && (cp = strchr(data, '@')))
        *cp = '\0';

#if NAME_HACK
    if (strcmp(field, NAME_FIELD) == 0) /* only edit name field */
        for (cp = data; *cp; cp++) {    /* apply any special editing to names */
            if (*cp == '-')  *cp = ' '; /* index both hyphenated names */
            if (*cp == '\'') strcpy(cp, cp+1); /* squeeze out apostrophe */
        }
#endif

    strcat(data, " ");              /* line ends with a space */
    cp = data;
    while(cp2 = strchr(cp, ' ')) {  /* break at space boundary */
        *cp2 = '\0';
        if (strlen(cp) > KEYWORD_SIZE)
            printf("Truncating %d character word /%s/ to %d characters\n",
                   strlen(cp), cp, KEYWORD_SIZE);
        if (strlen(cp) >= MIN_KEYWORD) {
            sprintf(idx_record, "%-*.*s%s%s", KEYWORD_SIZE, KEYWORD_SIZE, cp, field, id);
            strncpy(idx_key, idx_record, IDX_KEY_SIZE);
            idx_key[IDX_KEY_SIZE] = '\0';
            if ((field_attrib[atoi(field)] & ATTR_UNIQUE) &&
                ((status = sys$get(idxptr)) & 1))  /* unique record found? */
                    printf("Omit duplicate unique record: %s\n", line);
            else {
                idxptr->rab$w_rsz = IDX_RECORD_SIZE;
                if (((status = sys$put(idxptr)) & 1) == 0)
                    if (status != RMS$_DUP)
                        lib$stop(status);
            }
        }
        cp = cp2 + 1;
    }
}


char * get_field(char *ptr, char *field)
{
    int ind;

    for (ind= 0; *ptr != '\0' && *ptr != ':'; ptr++, ind++)
        field[ind] = _tolower(*ptr);
    field[ind] = '\0';
    if (*ptr == ':') ptr++;  /* skip over terminating ":" */
    return ptr;
}


void read_fields(char *file)
{
    FILE *cnf;
    char *ptr, config[256], line[256], field[128];
    int ind, field_num;
    short leng;
    $DESCRIPTOR(config_dsc, config);

    if (cli$present(descr("configuration")) & 1) { /* if /config */
        cli$get_value(descr("configuration"), &config_dsc, &leng);
        config[leng] = '\0';
    }
    else {                      /* no /config switch */
        strcpy(config, file);
        ptr = strrchr(config, '.');
        if (ptr) *ptr = '\0';
        strcat(config,".cnf");
    }

    for (ind = 0; ind < MAX_FIELD; ind++)
        field_attrib[ind] = 0;                  /* init array */

    if ((cnf = fopen(config, "r", "dna=.cnf")) == NULL) {
        printf("Can't read config file %s\n", config);
        exit(7);
    }

    while (fgets(line, sizeof(line), cnf)) {
        ptr = strchr(line, '\n');
        if (ptr) *ptr = '\0';                   /* remove newline */

        ptr = line;
        if ((*ptr == '#') || (*ptr == '\0'))    /* comment or blank? */
            continue;                           /* yes, skip line */
        ptr = get_field(ptr, field);            /* field number */
        field_num = atoi(field);

        ptr = get_field(ptr, field);            /* field name */
        ptr = get_field(ptr, field);            /* field size */
        ptr = get_field(ptr, field);            /* field description */
        ptr = get_field(ptr, field);            /* field option */

        for (;;) {
            ptr = get_field(ptr, field);	/* get attribute */
            if (strlen(field) == 0)
                break;                          /* no more attributes */

            /* attributes are unique to one letter */
            for (ind = 0; ind < MAX_ATTRIBUTES; ind++)
                if (field[0] == _tolower(attributes[ind].name[0]))
                    field_attrib[field_num] |= attributes[ind].value;
        }
    }

    fclose(cnf);
}

/* descr() creates character descriptor and returns
 * the address of the descriptor to the caller.
 */
# define N_DESCR 10
static struct dsc$descriptor_s str_desc[N_DESCR];
static int cur_descr = -1;

struct dsc$descriptor_s *descr(char *string)
{
    if(++cur_descr >= N_DESCR) cur_descr = 0;
    str_desc[cur_descr].dsc$w_length=(short)strlen(string);      
    str_desc[cur_descr].dsc$b_dtype=DSC$K_DTYPE_T;   
    str_desc[cur_descr].dsc$b_class=DSC$K_CLASS_S;  
    str_desc[cur_descr].dsc$a_pointer=string;     
    return (&str_desc[cur_descr]);
}
/*
 * Gopher transfer trailer captured with this file (not part of the program):
 * .
 * Response: text/plain
 * Original URL: gopher://bitreich.org/0/gopher2007/2007-gopher-mirror/gop...
 * Content-Type: text/plain; charset=utf-8
 */