OiO.lk Blog C++ Generic QuickSort implementation struggles with a lot of strings
C++

Generic QuickSort implementation struggles with a lot of strings


Ok basically i made this quicksort implementation in C for uni that is supposed to be able to recieve any type of array and sort it. For a stress test I was provided a records.csv file with 20mil lines structured like this: <id [int]>,<field1 [string]>,<field2 [int]>,field3 [float]>

My algorithm has to sort the file lines either by sorting by field1 or field2 or field3 (specified in the user input)

The problem is that, even if sorting by field2 and 3 works perfectly fine (on my machine in 12/13 sec), sorting by field1 causes immense wait time and a subsequential segmentation fault. This doesn’t happen tho if I try and sort a smaller set of lines (i tried 1k) and it works just fine. I also have a mergesort implementation in the same file that works even if it takes like 25sec, so I assumed it’s not a problem with the csv file.

I’m leaving the sortlib.c file with the algorithm and the mass_test.c file with the csv file reader/writer.

Every suggestion is gold to me so ty in advance 🙂

P.S.
the full code is on github at "https://github.com/SamueleTondaRoc/SortLib-for-uni"

sortlib.h only includes some needed libraries and prototypes for merge_sort and quick_sort

sortlib.c:

#include "sortlib.h"

#define BASE(i) ((base) + ((i) * size))

void swap (void *a, void *b, size_t size, void* temp)
{
    memcpy (temp, a, size);
    memcpy (a, b, size);
    memcpy (b, temp, size);
}

size_t split (int low, int hig, void* base, size_t size, int (*compar)(const void*, const void*), void* temp)
{
    int i = low - 1, j = low, mid = low + ((hig - low) / 2), pivot = hig;
    
    if (!((*compar)(BASE (low), BASE (mid)) + (*compar)(BASE (low), BASE (hig)))) swap (BASE (low), BASE (hig), size, temp);
    else if (!((*compar)(BASE (mid), BASE (low)) + (*compar)(BASE (mid), BASE (hig)))) swap (BASE (mid), BASE (hig), size, temp);

    while (j < hig)
    {
        if ((*compar)(BASE (j), BASE (pivot)) < 0) 
        {
            i++;
            swap (BASE (i), BASE (j), size, temp);
        }
        j++;
    }
    swap (BASE (i + 1), BASE (pivot), size, temp);
    pivot = i + 1;
    return pivot;
}

/*
    @brief recursive method for the quick sort
*/
void quick_sort_R (int low, int hig, void *base, size_t size, int (*compar)(const void*, const void*), void* temp)
{
    if (low < hig)
    {
        int pivot = split (low, hig, base, size, compar, temp);
        
        quick_sort_R (low, pivot - 1, base, size, compar, temp);
        quick_sort_R (pivot + 1, hig, base, size, compar, temp);
    }
}

/*
    @brief quick sort for any type of array
    @param base pointer to the first element of the arry
    @param nitems lenght of the array
    @param size size of array items
    @param compar compare function
    @warning needs a compare function such that if the first elem is "bigger" than the second it returns 1, if it's equal it ret 0, else -1
*/
void quick_sort (void *base, size_t nitems, size_t size, int (*compar)(const void*, const void*)) 
{
    void *temp = (void*) malloc (size);
    if (base && compar && nitems)
        quick_sort_R (0, nitems - 1, base, size, compar, temp);
    free (temp);
}

mass_tester.c:

#include "support.h"
#include "../src/sortlib.h"
#include <stdint.h>
#include <time.h>

#define MAX_SIZE 100

typedef struct data_line {
    int id;
    char field1[MAX_SIZE];
    int field2;
    float field3;
} Line, *Line_ptr;

size_t calc_csvFile_len (FILE *file)
{
    size_t len = 0;
    printf ("calculating file lenght...\n");
    __clock_t start = clock(), end;

        char s[MAX_SIZE];
    while(fscanf (file, " %99[^\n]", s) == 1) len++;

    end = clock();
    double algo_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf ("file lenght: %ld\nTook: %.2fsec\n\n", len, algo_time);

    fseek (file, 0L, SEEK_SET);
    return len;
}

void gather_data (FILE *infile, Line_ptr *data)
{
    int id, field2;
    char field1[MAX_SIZE];
    float field3;

    printf ("reading data...\n");
    __clock_t start = clock(), end;

    for (size_t i = 0; !feof (infile); i++)
    {
        fscanf (infile, "%d,%[^,],%d,%f\n", &id, field1, &field2, &field3);
        (*data)[i].id = id;
        strcpy ((*data)[i].field1, field1);
        (*data)[i].field2 = field2;
        (*data)[i].field3 = field3;
    }
    fclose (infile);

    end = clock();
    double algo_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf ("data loading complete!\nTook: %.2fsec\n\n", algo_time);
}

void write_data (FILE *outfile, Line_ptr *data, size_t len)
{
    printf ("writing data...\n");
    __clock_t start = clock(), end;
    
    for (size_t i = 0; i < len; i++)
    {
        fprintf (outfile, "%d,%s,%d,%f\n", (*data)[i].id, (*data)[i].field1, (*data)[i].field2, (*data)[i].field3);
    }
    fclose (outfile);

    end = clock();
    double algo_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf ("data writing complete!\nTook: %.2fsec\n\n", algo_time);
}

int cmp_field1 (const void* a, const void* b)
{
    char A[15], B[15];
    strcpy (A, ((Line_ptr) a) -> field1);
    strcpy (B, ((Line_ptr) b) -> field1);
    int cmp = strcmp (A, B);
    return cmp > 0 ? 1 : (cmp < 0 ? -1 : 0);
}

int cmp_field2 (const void* a, const void* b)
{
    int A = ((Line_ptr) a) -> field2;
    int B = ((Line_ptr) b) -> field2;
    return A < B ? -1 : (A == B ? 0 : 1);
}

int cmp_field3 (const void* a, const void* b)
{
    float A = ((Line_ptr) a) -> field3;
    float B = ((Line_ptr) b) -> field3;
    return A < B ? -1 : (A == B ? 0 : 1);
}

void sort_records (FILE *infile, FILE *outfile, size_t field, size_t algo)
{

    // preparing the array 
    size_t len = calc_csvFile_len (infile);
    Line_ptr data = (Line_ptr) malloc (sizeof(Line) * len);

    // filling the array with data from file
    gather_data (infile, &data);

    printf ("sorting...\n");

    // defining compare func
    int (*cmp_func)(const void*, const void*) = field == 1 ? (*cmp_field1) : (field == 2 ? (*cmp_field2) : (*cmp_field3));  

    // chosing sorting algorithm
    __clock_t start = clock(), end;
    if (algo == 1) merge_sort ((void**) &data, len, sizeof(Line), cmp_func);
    else quick_sort ((void*) data, len, sizeof(Line), cmp_func);
    end = clock();
    double algo_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf ("array sorted in %.2f seconds\n\n", algo_time);

    // dumping the array data in an output file
    write_data(outfile, &data, len);
    free (data);
}

int main (int argc, char **argv)
{
    // reading command
    char* in_file_path = argv[1];
    char* out_file_path = argv[2];
    size_t field = (int) (*argv[3] - '0'),
            algo = (int) (*argv[4] - '0');

    // opening files (the closing happens in the gather_data and write_data methods)
    FILE *infile = NULL, *outfile = NULL;
    if (in_file_path) infile = fopen (in_file_path, "r");
    if (out_file_path) outfile = fopen (out_file_path, "w");

    // checking inputs and launching sorter
    if (infile && outfile && (field > 0 && field < 4) && (algo > 0 && algo < 3)) 
        sort_records (infile, outfile, field, algo);    
    else 
        printf("error in passed data: \"%s %s %s %ld %ld\"\n", argv[0], in_file_path, out_file_path, field, algo);  // info dump in case of error

    return 0;
}



You need to sign in to view this answers

Exit mobile version