csv.c

Go to the documentation of this file.
00001 /* csv - read write comma separated value format
00002  * Copyright (c) 2003 Michael B. Allen <mba2000 ioplex.com>
00003  *
00004  * The MIT License
00005  * 
00006  * Permission is hereby granted, free of charge, to any person obtaining a
00007  * copy of this software and associated documentation files (the "Software"),
00008  * to deal in the Software without restriction, including without limitation
00009  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
00010  * and/or sell copies of the Software, and to permit persons to whom the
00011  * Software is furnished to do so, subject to the following conditions:
00012  * 
00013  * The above copyright notice and this permission notice shall be included
00014  * in all copies or substantial portions of the Software.
00015  * 
00016  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00017  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00018  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
00019  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
00020  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
00021  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
00022  * OTHER DEALINGS IN THE SOFTWARE.
00023  */
00024 
00025 /* We (Juergen Haas and Tomasz Motylewski) execute our rights given above
00026  * to distribute and sublicence this file (csv.c) and csv.h, csv_defines.h
00027  * under General Public Licence version 2 or any later version.
00028  *
00029  * This file is derived from libmba : A library of generic C modules
00030  * http://www.ioplex.com/~miallen/libmba/dl/libmba-0.8.9.tar.gz
00031  */
00032 
00038 #ifdef HAVE_CONFIG_H
00039 #include <config.h>
00040 #endif /* HAVE_CONFIG_H */
00041 
00042 
00043 #include <stdlib.h>
00044 #include <string.h>
00045 #include <stdio.h>
00046 #include <ctype.h>
00047 #include <errno.h>
00048 #include <wchar.h>
00049 #include <wctype.h>
00050 
00051 #include "gerbv.h"
00052 #include "csv.h"
00053 #include "csv_defines.h"
/*
 * States of the record parser used by csv_parse_str() and csv_parse_wcs().
 */
#define ST_START     1 /* at the beginning of a field: leading whitespace and
                          an optional opening quote are handled here */
#define ST_COLLECT   2 /* copying the characters of a field into the buffer */
#define ST_TAILSPACE 3 /* whitespace seen after a closing quote; only more
                          whitespace or a field terminator may follow */
#define ST_END_QUOTE 4 /* a quote was seen inside a quoted field; the next
                          character tells a closing quote from an escaped one */

/* Wide-character whitespace test; an alias for iswspace(). */
#define istspace iswspace
00059 
00060 
/*
 * Narrow-character input source consumed by snextch().
 *
 * When `in` is non-NULL characters are read from that stream; otherwise
 * they are taken from the memory region described by `src` and `sn`.
 */
struct sinput {
    FILE *in;        /* stream to read from, or NULL to read from src */
    const char *src; /* next unread byte of the in-memory source */
    size_t sn;       /* number of bytes still available at src */
    size_t count;    /* number of characters handed out so far */
};
00067 
00068 
/*
 * Wide-character input source consumed by wnextch(); always memory backed.
 */
struct winput {
    const wchar_t *src; /* next unread wide character */
    size_t sn;          /* number of wide characters still available at src */
    size_t count;       /* number of wide characters handed out so far */
};
00074 
00075 
00076 static int
00077 snextch(struct sinput *in)
00078 {
00079        int ch;
00080 
00081        if (in->in) {
00082               if ((ch = fgetc(in->in)) == EOF) {
00083                      if (ferror(in->in)) {
00084                             GERB_MESSAGE("errno:%d", errno);
00085                             return -1;
00086                      }
00087                      return 0;
00088               }
00089        } else {
00090               if (in->sn == 0) {
00091                      return 0;
00092               }
00093               ch = (unsigned char) *(in->src)++;
00094               in->sn--;
00095        }
00096        in->count++;
00097 
00098        return ch;
00099 }/* snextch */
00100 
00101 
00102 static int
00103 wnextch(struct winput *in)
00104 {
00105        int ch;
00106 
00107        if (in->sn == 0) {
00108               return 0;
00109        }
00110        ch = *(in->src)++;
00111        in->sn--;
00112        in->count++;
00113 
00114        return ch;
00115 }/* wnextch */
00116 
00117 static int
00118 csv_parse_str(struct sinput *in, char *buf, size_t bn, char *row[], int rn, int sep, int flags)
00119 {
00120        int trim, quotes, ch, state, r, j, t, inquotes;
00121 
00122        trim = flags & CSV_TRIM;
00123        quotes = flags & CSV_QUOTES;
00124        state = ST_START;
00125        inquotes = 0;
00126        ch = r = j = t = 0;
00127 
00128        memset(row, 0, sizeof(char *) * rn);
00129 
00130        while (rn && bn && (ch = snextch(in)) > 0) {
00131               switch (state) {
00132                      case ST_START:
00133                             if (ch != '\n' && ch != sep && isspace(ch)) {
00134                                    if (!trim) {
00135                                           buf[j++] = ch; bn--;
00136                                           t = j;
00137                                    }
00138                                    break;
00139                             } else if (quotes && ch == '"') {
00140                                    j = t = 0;
00141                                    state = ST_COLLECT;
00142                                    inquotes = 1;
00143                                    break;
00144                             }
00145                             state = ST_COLLECT;
00146                      case ST_COLLECT:
00147                             if (inquotes) {
00148                                    if (ch == '"') {
00149                                           state = ST_END_QUOTE;
00150                                           break;
00151                                    }
00152                             } else if (ch == sep || ch == '\n') {
00153                                    row[r++] = buf; rn--;
00154                                    buf[t] = '\0'; bn--;
00155                                    buf += t + 1;
00156                                    j = t = 0;
00157 
00158                                    state = ST_START;
00159                                    inquotes = 0;
00160                                    if (ch == '\n') {
00161                                           rn = 0;
00162                                    }
00163                                    break;
00164                             } else if (quotes && ch == '"') {
00165                                    errno = EILSEQ;
00166                                    GERB_MESSAGE("%d: unexpected quote in element",errno);
00167                                    return -1;
00168                             }
00169                             buf[j++] = ch; bn--;
00170                             if (!trim || isspace(ch) == 0) {
00171                                    t = j;
00172                             }
00173                             break;
00174                      case ST_TAILSPACE:
00175                      case ST_END_QUOTE:
00176                             if (ch == sep || ch == '\n') {
00177                                    row[r++] = buf; rn--;
00178                                    buf[j] = '\0'; bn--;
00179                                    buf += j + 1;
00180                                    j = t =  0;
00181                                    state = ST_START;
00182                                    inquotes = 0;
00183                                    if (ch == '\n') {
00184                                           rn = 0;
00185                                    }
00186                                    break;
00187                             } else if (quotes && ch == '"' && state != ST_TAILSPACE) {
00188                                    buf[j++] = '"';      bn--;          /* nope, just an escaped quote */
00189                                    t = j;
00190                                    state = ST_COLLECT;
00191                                    break;
00192                             } else if (isspace(ch)) {
00193                                    state = ST_TAILSPACE;
00194                                    break;
00195                             }
00196                             errno = EILSEQ;
00197                             GERB_MESSAGE("%d: bad end quote in element", errno);
00198                             return -1;
00199               }
00200        }
00201        if (ch <= 0) {
00202               /* treat EOF as EOL, so the last record is accepted even when
00203                  \n is not present. Some users parse strings, not lines */
00204               if(state == ST_TAILSPACE || state == ST_END_QUOTE
00205                      || (state == ST_COLLECT && ! inquotes)) {
00206                      row[r++] = buf; rn--;
00207                      buf[j] = '\0'; bn--;
00208                      buf += j + 1;
00209                      inquotes = 0;
00210                      rn = 0;
00211               } else {
00212        //            AMSG("");
00213                      return -1;
00214               }
00215        }
00216        if (bn == 0) {
00217               errno = E2BIG;
00218               GERB_MESSAGE("E2BIG %d ", errno);
00219               return -1;
00220        }
00221        if (rn) {
00222               if (inquotes) {
00223                      errno = EILSEQ;
00224                      GERB_MESSAGE("EILSEQ %d ", errno);
00225                      return -1;
00226               }
00227               row[r] = buf;
00228               buf[t] = '\0';
00229        }
00230        // return error if we can't read the minimum number of fields
00231        if (r < 4) {
00232               return -1;
00233        }
00234        return in->count;
00235 }/* csv_parse_str */
00236 
00237 
00238 static int
00239 csv_parse_wcs(struct winput *in, wchar_t *buf, size_t bn, wchar_t *row[], int rn, wint_t sep, int flags)
00240 {
00241        int trim, quotes, state, r, j, t, inquotes;
00242        wint_t ch;
00243 
00244        trim = flags & CSV_TRIM;
00245        quotes = flags & CSV_QUOTES;
00246        state = ST_START;
00247        inquotes = 0;
00248        ch = r = j = t = 0;
00249 
00250        memset(row, 0, sizeof(wchar_t *) * rn);
00251 
00252        while (rn && bn && (ch = wnextch(in)) > 0) {
00253               switch (state) {
00254                      case ST_START:
00255                             if (ch != L'\n' && ch != sep && iswspace(ch)) {
00256                                    if (!trim) {
00257                                           buf[j++] = ch; bn--;
00258                                           t = j;
00259                                    }
00260                                    break;
00261                             } else if (quotes && ch == L'"') {
00262                                    j = t = 0;
00263                                    state = ST_COLLECT;
00264                                    inquotes = 1;
00265                                    break;
00266                             }
00267                             state = ST_COLLECT;
00268                      case ST_COLLECT:
00269                             if (inquotes) {
00270                                    if (ch == L'"') {
00271                                           state = ST_END_QUOTE;
00272                                           break;
00273                                    }
00274                             } else if (ch == sep || ch == L'\n') {
00275                                    row[r++] = buf; rn--;
00276                                    buf[t] = L'\0'; bn--;
00277                                    buf += t + 1;
00278                                    j = t = 0;
00279                                    state = ST_START;
00280                                    inquotes = 0;
00281                                    if (ch == L'\n') {
00282                                           rn = 0;
00283                                    }
00284                                    break;
00285                             } else if (quotes && ch == L'"') {
00286                                    errno = EILSEQ;
00287                                    GERB_MESSAGE("%d: unexpected quote in element", errno);
00288                                    return -1;
00289                             }
00290                             buf[j++] = ch; bn--;
00291                             if (!trim || iswspace(ch) == 0) {
00292                                    t = j;
00293                             }
00294                             break;
00295                      case ST_TAILSPACE:
00296                      case ST_END_QUOTE:
00297                             if (ch == sep || ch == L'\n') {
00298                                    row[r++] = buf; rn--;
00299                                    buf[j] = L'\0'; bn--;
00300                                    buf += j + 1;
00301                                    j = t =  0;
00302                                    state = ST_START;
00303                                    inquotes = 0;
00304                                    if (ch == L'\n') {
00305                                           rn = 0;
00306                                    }
00307                                    break;
00308                             } else if (quotes && ch == L'"' && state != ST_TAILSPACE) {
00309                                    buf[j++] = L'"'; bn--;              /* nope, just an escaped quote */
00310                                    t = j;
00311                                    state = ST_COLLECT;
00312                                    break;
00313                             } else if (iswspace(ch)) {
00314                                    state = ST_TAILSPACE;
00315                                    break;
00316                             }
00317                             errno = EILSEQ;
00318                             GERB_MESSAGE("%d: bad end quote in element ", errno);
00319                             return -1;
00320               }
00321        }
00322        if (ch <= 0) {
00323               /* treat EOF as EOL, so the last record is accepted even when
00324                  \n is not present. Some users parse strings, not lines */
00325               if(state == ST_TAILSPACE || state == ST_END_QUOTE
00326                      || (state == ST_COLLECT && ! inquotes)) {
00327                      row[r++] = buf; rn--;
00328                      buf[j] = L'\0'; bn--;
00329                      buf += j + 1;
00330                      inquotes = 0;
00331                      rn = 0;
00332               } else {
00333        //            AMSG("");
00334                      return -1;
00335               }
00336        }
00337        if (bn == 0) {
00338               errno = E2BIG;
00339        GERB_MESSAGE("%d", errno);
00340               return -1;
00341        }
00342        if (rn) {
00343               if (inquotes) {
00344                      errno = EILSEQ;
00345               GERB_MESSAGE("%d", errno);
00346                      return -1;
00347               }
00348               row[r] = buf;
00349               buf[t] = L'\0';
00350        }
00351 
00352        return in->count;
00353 }/*csv_row_parse_wcs*/
00354 
00355 
00356 int
00357 csv_row_parse_wcs(const wchar_t *src, size_t sn, wchar_t *buf, size_t bn, wchar_t *row[], int rn, int sep, int trim)
00358 {
00359        struct winput input;
00360        input.src = src;
00361        input.sn = sn;
00362        input.count = 0;
00363        return csv_parse_wcs(&input, buf, bn, row, rn, (wint_t)sep, trim);
00364 }/*csv_row_parse_wcs*/
00365 
00366 
00367 int
00368 csv_row_parse_str(const char *src, size_t sn, char *buf, size_t bn, char *row[], int rn, int sep, int trim)
00369 {
00370        struct sinput input;
00371        input.in = NULL;
00372        input.src = src;
00373        input.sn = sn;
00374        input.count = 0;
00375        return csv_parse_str(&input, buf, bn, row, rn, sep, trim);
00376 }/*csv_row_parse_str*/
00377 
00378 
00379 int
00380 csv_row_fread(FILE *in, char *buf, size_t bn, char *row[], int numcols, int sep, int trim)
00381 {
00382        struct sinput input;
00383        input.in = in;
00384        input.count = 0;
00385        return csv_parse_str(&input, buf, bn, row, numcols, sep, trim);
00386 }/*csv_row_fread*/
00387 

Generated on Tue Aug 19 00:14:48 2008 for gerbv by  doxygen 1.5.6