PostgreSQL Source Code git master
Loading...
Searching...
No Matches
copyfromparse.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * copyfromparse.c
4 * Parse CSV/text/binary format for COPY FROM.
5 *
6 * This file contains routines to parse the text, CSV and binary input
7 * formats. The main entry point is NextCopyFrom(), which parses the
8 * next input line and returns it as Datums.
9 *
10 * In text/CSV mode, the parsing happens in multiple stages:
11 *
12 * [data source] --> raw_buf --> input_buf --> line_buf --> attribute_buf
13 * 1. 2. 3. 4.
14 *
15 * 1. CopyLoadRawBuf() reads raw data from the input file or client, and
16 * places it into 'raw_buf'.
17 *
18 * 2. CopyConvertBuf() calls the encoding conversion function to convert
19 * the data in 'raw_buf' from client to server encoding, placing the
20 * converted result in 'input_buf'.
21 *
22 * 3. CopyReadLine() parses the data in 'input_buf', one line at a time.
23 * It is responsible for finding the next newline marker, taking quote and
24 * escape characters into account according to the COPY options. The line
25 * is copied into 'line_buf', with quotes and escape characters still
26 * intact.
27 *
28 * 4. CopyReadAttributesText/CSV() function takes the input line from
29 * 'line_buf', and splits it into fields, unescaping the data as required.
30 * The fields are stored in 'attribute_buf', and 'raw_fields' array holds
31 * pointers to each field.
32 *
33 * If encoding conversion is not required, a shortcut is taken in step 2 to
34 * avoid copying the data unnecessarily. The 'input_buf' pointer is set to
35 * point directly to 'raw_buf', so that CopyLoadRawBuf() loads the raw data
36 * directly into 'input_buf'. CopyConvertBuf() then merely validates that
37 * the data is valid in the current encoding.
38 *
39 * In binary mode, the pipeline is much simpler. Input is loaded into
40 * 'raw_buf', and encoding conversion is done in the datatype-specific
41 * receive functions, if required. 'input_buf' and 'line_buf' are not used,
42 * but 'attribute_buf' is used as a temporary buffer to hold one attribute's
43 * data when it's passed to the receive function.
44 *
45 * 'raw_buf' is always 64 kB in size (RAW_BUF_SIZE). 'input_buf' is also
46 * 64 kB (INPUT_BUF_SIZE), if encoding conversion is required. 'line_buf'
47 * and 'attribute_buf' are expanded on demand, to hold the longest line
48 * encountered so far.
49 *
50 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
51 * Portions Copyright (c) 1994, Regents of the University of California
52 *
53 *
54 * IDENTIFICATION
55 * src/backend/commands/copyfromparse.c
56 *
57 *-------------------------------------------------------------------------
58 */
59#include "postgres.h"
60
61#include <ctype.h>
62#include <unistd.h>
63#include <sys/stat.h>
64
65#include "commands/copyapi.h"
67#include "commands/progress.h"
68#include "executor/executor.h"
69#include "libpq/libpq.h"
70#include "libpq/pqformat.h"
71#include "mb/pg_wchar.h"
72#include "miscadmin.h"
73#include "pgstat.h"
74#include "port/pg_bswap.h"
75#include "utils/builtins.h"
76#include "utils/rel.h"
77
78#define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
79#define OCTVALUE(c) ((c) - '0')
80
81/*
82 * These macros centralize code used to process line_buf and input_buf buffers.
83 * They are macros because they often do continue/break control and to avoid
84 * function call overhead in tight COPY loops.
85 *
86 * We must use "if (1)" because the usual "do {...} while(0)" wrapper would
87 * prevent the continue/break processing from working. We end the "if (1)"
88 * with "else ((void) 0)" to ensure the "if" does not unintentionally match
89 * any "else" in the calling code, and to avoid any compiler warnings about
90 * empty statements. See http://www.cit.gu.edu.au/~anthony/info/C/C.macros.
91 */
92
/*
 * IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen)
 *
 * If looking 'extralen' bytes beyond input_buf_ptr would reach or pass
 * copy_buf_len and EOF has not been hit, rewind input_buf_ptr to
 * prev_raw_ptr, set need_data, and 'continue' the enclosing loop.  Otherwise
 * this is a no-op and execution falls through.
 *
 * The rewind keeps the character read at the top of the loop in the buffer
 * even if there is more than one read-ahead.
 *
 * The expansion uses these names from the calling scope: input_buf_ptr,
 * copy_buf_len, hit_eof, prev_raw_ptr and need_data.  It must be used
 * directly inside a loop body, because it expands to a bare 'continue'.
 */
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen) \
if (1) \
{ \
	if (input_buf_ptr + (extralen) >= copy_buf_len && !hit_eof) \
	{ \
		input_buf_ptr = prev_raw_ptr; /* undo fetch */ \
		need_data = true; \
		continue; \
	} \
} else ((void) 0)
107
/*
 * IF_NEED_REFILL_AND_EOF_BREAK(extralen)
 *
 * If looking 'extralen' bytes beyond input_buf_ptr would reach or pass
 * copy_buf_len and EOF has already been hit, set result to true and 'break'
 * out of the enclosing loop.  When 'extralen' is nonzero, the remainder of
 * the buffer is consumed first by advancing input_buf_ptr to copy_buf_len.
 * Otherwise this is a no-op and execution falls through.
 *
 * The expansion uses these names from the calling scope: input_buf_ptr,
 * copy_buf_len, hit_eof and result.  It must be used directly inside a loop
 * body, because it expands to a bare 'break'.
 */
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen) \
if (1) \
{ \
	if (input_buf_ptr + (extralen) >= copy_buf_len && hit_eof) \
	{ \
		if (extralen) \
			input_buf_ptr = copy_buf_len; /* consume the partial character */ \
		/* backslash just before EOF, treat as data char */ \
		result = true; \
		break; \
	} \
} else ((void) 0)
121
/*
 * REFILL_LINEBUF
 *
 * Transfer any approved data to line_buf; must do this to be sure
 * there is some room in input_buf.
 *
 * "Approved" data is the byte range [cstate->input_buf_index, input_buf_ptr)
 * of cstate->input_buf.  If that range is non-empty it is appended to
 * cstate->line_buf and cstate->input_buf_index is advanced to input_buf_ptr;
 * otherwise nothing happens.
 *
 * The expansion uses 'cstate' and 'input_buf_ptr' from the calling scope.
 */
#define REFILL_LINEBUF \
if (1) \
{ \
	if (input_buf_ptr > cstate->input_buf_index) \
	{ \
		appendBinaryStringInfo(&cstate->line_buf, \
							   cstate->input_buf + cstate->input_buf_index, \
							   input_buf_ptr - cstate->input_buf_index); \
		cstate->input_buf_index = input_buf_ptr; \
	} \
} else ((void) 0)
137
/*
 * Signature expected at the start of binary-format COPY input; the header
 * reader fetches 11 bytes and compares them against this with memcmp().
 *
 * The array is declared with exactly 11 elements, so the string literal's
 * implicit terminating NUL is not stored; the explicit "\0" is the eleventh
 * signature byte.  Always compare with an explicit length of 11, never with
 * string functions.
 *
 * NOTE: there's a copy of this in copyto.c
 */
static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
140
141
142/* non-export function prototypes */
143static bool CopyReadLine(CopyFromState cstate, bool is_csv);
144static bool CopyReadLineText(CopyFromState cstate, bool is_csv);
145static int CopyReadAttributesText(CopyFromState cstate);
146static int CopyReadAttributesCSV(CopyFromState cstate);
148 Oid typioparam, int32 typmod,
149 bool *isnull);
151 ExprContext *econtext,
152 Datum *values,
153 bool *nulls,
154 bool is_csv);
156 char ***fields,
157 int *nfields,
158 bool is_csv);
159
160
161/* Low-level communications functions */
162static int CopyGetData(CopyFromState cstate, void *databuf,
163 int minread, int maxread);
164static inline bool CopyGetInt32(CopyFromState cstate, int32 *val);
165static inline bool CopyGetInt16(CopyFromState cstate, int16 *val);
166static void CopyLoadInputBuf(CopyFromState cstate);
167static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes);
168
169void
171{
173 int natts = list_length(cstate->attnumlist);
174 int16 format = (cstate->opts.binary ? 1 : 0);
175 int i;
176
178 pq_sendbyte(&buf, format); /* overall format */
179 pq_sendint16(&buf, natts);
180 for (i = 0; i < natts; i++)
181 pq_sendint16(&buf, format); /* per-column formats */
183 cstate->copy_src = COPY_FRONTEND;
184 cstate->fe_msgbuf = makeStringInfo();
185 /* We *must* flush here to ensure FE knows it can send. */
186 pq_flush();
187}
188
189void
191{
192 char readSig[11];
193 int32 tmp;
194
195 /* Signature */
196 if (CopyReadBinaryData(cstate, readSig, 11) != 11 ||
197 memcmp(readSig, BinarySignature, 11) != 0)
200 errmsg("COPY file signature not recognized")));
201 /* Flags field */
202 if (!CopyGetInt32(cstate, &tmp))
205 errmsg("invalid COPY file header (missing flags)")));
206 if ((tmp & (1 << 16)) != 0)
209 errmsg("invalid COPY file header (WITH OIDS)")));
210 tmp &= ~(1 << 16);
211 if ((tmp >> 16) != 0)
214 errmsg("unrecognized critical flags in COPY file header")));
215 /* Header extension length */
216 if (!CopyGetInt32(cstate, &tmp) ||
217 tmp < 0)
220 errmsg("invalid COPY file header (missing length)")));
221 /* Skip extension header, if present */
222 while (tmp-- > 0)
223 {
224 if (CopyReadBinaryData(cstate, readSig, 1) != 1)
227 errmsg("invalid COPY file header (wrong length)")));
228 }
229}
230
231/*
232 * CopyGetData reads data from the source (file or frontend)
233 *
234 * We attempt to read at least minread, and at most maxread, bytes from
235 * the source. The actual number of bytes read is returned; if this is
236 * less than minread, EOF was detected.
237 *
238 * Note: when copying from the frontend, we expect a proper EOF mark per
239 * protocol; if the frontend simply drops the connection, we raise error.
240 * It seems unwise to allow the COPY IN to complete normally in that case.
241 *
242 * NB: no data conversion is applied here.
243 */
244static int
246{
247 int bytesread = 0;
248
249 switch (cstate->copy_src)
250 {
251 case COPY_FILE:
253 bytesread = fread(databuf, 1, maxread, cstate->copy_file);
255 if (ferror(cstate->copy_file))
258 errmsg("could not read from COPY file: %m")));
259 if (bytesread == 0)
260 cstate->raw_reached_eof = true;
261 break;
262 case COPY_FRONTEND:
263 while (maxread > 0 && bytesread < minread && !cstate->raw_reached_eof)
264 {
265 int avail;
266
267 while (cstate->fe_msgbuf->cursor >= cstate->fe_msgbuf->len)
268 {
269 /* Try to receive another message */
270 int mtype;
271 int maxmsglen;
272
276 mtype = pq_getbyte();
277 if (mtype == EOF)
280 errmsg("unexpected EOF on client connection with an open transaction")));
281 /* Validate message type and set packet size limit */
282 switch (mtype)
283 {
284 case PqMsg_CopyData:
286 break;
287 case PqMsg_CopyDone:
288 case PqMsg_CopyFail:
289 case PqMsg_Flush:
290 case PqMsg_Sync:
292 break;
293 default:
296 errmsg("unexpected message type 0x%02X during COPY from stdin",
297 mtype)));
298 maxmsglen = 0; /* keep compiler quiet */
299 break;
300 }
301 /* Now collect the message body */
302 if (pq_getmessage(cstate->fe_msgbuf, maxmsglen))
305 errmsg("unexpected EOF on client connection with an open transaction")));
307 /* ... and process it */
308 switch (mtype)
309 {
310 case PqMsg_CopyData:
311 break;
312 case PqMsg_CopyDone:
313 /* COPY IN correctly terminated by frontend */
314 cstate->raw_reached_eof = true;
315 return bytesread;
316 case PqMsg_CopyFail:
319 errmsg("COPY from stdin failed: %s",
320 pq_getmsgstring(cstate->fe_msgbuf))));
321 break;
322 case PqMsg_Flush:
323 case PqMsg_Sync:
324
325 /*
326 * Ignore Flush/Sync for the convenience of client
327 * libraries (such as libpq) that may send those
328 * without noticing that the command they just
329 * sent was COPY.
330 */
331 goto readmessage;
332 default:
333 Assert(false); /* NOT REACHED */
334 }
335 }
336 avail = cstate->fe_msgbuf->len - cstate->fe_msgbuf->cursor;
337 if (avail > maxread)
338 avail = maxread;
339 pq_copymsgbytes(cstate->fe_msgbuf, databuf, avail);
340 databuf = (char *) databuf + avail;
341 maxread -= avail;
342 bytesread += avail;
343 }
344 break;
345 case COPY_CALLBACK:
347 break;
348 }
349
350 return bytesread;
351}
352
353
354/*
355 * These functions do apply some data conversion
356 */
357
358/*
359 * CopyGetInt32 reads an int32 that appears in network byte order
360 *
361 * Returns true if OK, false if EOF
362 */
363static inline bool
365{
366 uint32 buf;
367
368 if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
369 {
370 *val = 0; /* suppress compiler warning */
371 return false;
372 }
373 *val = (int32) pg_ntoh32(buf);
374 return true;
375}
376
377/*
378 * CopyGetInt16 reads an int16 that appears in network byte order
379 */
380static inline bool
382{
383 uint16 buf;
384
385 if (CopyReadBinaryData(cstate, (char *) &buf, sizeof(buf)) != sizeof(buf))
386 {
387 *val = 0; /* suppress compiler warning */
388 return false;
389 }
390 *val = (int16) pg_ntoh16(buf);
391 return true;
392}
393
394
395/*
396 * Perform encoding conversion on data in 'raw_buf', writing the converted
397 * data into 'input_buf'.
398 *
399 * On entry, there must be some data to convert in 'raw_buf'.
400 */
401static void
403{
404 /*
405 * If the file and server encoding are the same, no encoding conversion is
406 * required. However, we still need to verify that the input is valid for
407 * the encoding.
408 */
409 if (!cstate->need_transcoding)
410 {
411 /*
412 * When conversion is not required, input_buf and raw_buf are the
413 * same. raw_buf_len is the total number of bytes in the buffer, and
414 * input_buf_len tracks how many of those bytes have already been
415 * verified.
416 */
417 int preverifiedlen = cstate->input_buf_len;
418 int unverifiedlen = cstate->raw_buf_len - cstate->input_buf_len;
419 int nverified;
420
421 if (unverifiedlen == 0)
422 {
423 /*
424 * If no more raw data is coming, report the EOF to the caller.
425 */
426 if (cstate->raw_reached_eof)
427 cstate->input_reached_eof = true;
428 return;
429 }
430
431 /*
432 * Verify the new data, including any residual unverified bytes from
433 * previous round.
434 */
436 cstate->raw_buf + preverifiedlen,
438 if (nverified == 0)
439 {
440 /*
441 * Could not verify anything.
442 *
443 * If there is no more raw input data coming, it means that there
444 * was an incomplete multi-byte sequence at the end. Also, if
445 * there's "enough" input left, we should be able to verify at
446 * least one character, and a failure to do so means that we've
447 * hit an invalid byte sequence.
448 */
450 cstate->input_reached_error = true;
451 return;
452 }
453 cstate->input_buf_len += nverified;
454 }
455 else
456 {
457 /*
458 * Encoding conversion is needed.
459 */
460 int nbytes;
461 unsigned char *src;
462 int srclen;
463 unsigned char *dst;
464 int dstlen;
465 int convertedlen;
466
467 if (RAW_BUF_BYTES(cstate) == 0)
468 {
469 /*
470 * If no more raw data is coming, report the EOF to the caller.
471 */
472 if (cstate->raw_reached_eof)
473 cstate->input_reached_eof = true;
474 return;
475 }
476
477 /*
478 * First, copy down any unprocessed data.
479 */
480 nbytes = INPUT_BUF_BYTES(cstate);
481 if (nbytes > 0 && cstate->input_buf_index > 0)
482 memmove(cstate->input_buf, cstate->input_buf + cstate->input_buf_index,
483 nbytes);
484 cstate->input_buf_index = 0;
485 cstate->input_buf_len = nbytes;
486 cstate->input_buf[nbytes] = '\0';
487
488 src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
489 srclen = cstate->raw_buf_len - cstate->raw_buf_index;
490 dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
491 dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
492
493 /*
494 * Do the conversion. This might stop short, if there is an invalid
495 * byte sequence in the input. We'll convert as much as we can in
496 * that case.
497 *
498 * Note: Even if we hit an invalid byte sequence, we don't report the
499 * error until all the valid bytes have been consumed. The input
500 * might contain an end-of-input marker (\.), and we don't want to
501 * report an error if the invalid byte sequence is after the
502 * end-of-input marker. We might unnecessarily convert some data
503 * after the end-of-input marker as long as it's valid for the
504 * encoding, but that's harmless.
505 */
507 cstate->file_encoding,
509 src, srclen,
510 dst, dstlen,
511 true);
512 if (convertedlen == 0)
513 {
514 /*
515 * Could not convert anything. If there is no more raw input data
516 * coming, it means that there was an incomplete multi-byte
517 * sequence at the end. Also, if there is plenty of input left,
518 * we should be able to convert at least one character, so a
519 * failure to do so must mean that we've hit a byte sequence
520 * that's invalid.
521 */
523 cstate->input_reached_error = true;
524 return;
525 }
526 cstate->raw_buf_index += convertedlen;
527 cstate->input_buf_len += strlen((char *) dst);
528 }
529}
530
531/*
532 * Report an encoding or conversion error.
533 */
534static void
536{
537 Assert(cstate->raw_buf_len > 0);
539
540 if (!cstate->need_transcoding)
541 {
542 /*
543 * Everything up to input_buf_len was successfully verified, and
544 * input_buf_len points to the invalid or incomplete character.
545 */
547 cstate->raw_buf + cstate->input_buf_len,
548 cstate->raw_buf_len - cstate->input_buf_len);
549 }
550 else
551 {
552 /*
553 * raw_buf_index points to the invalid or untranslatable character. We
554 * let the conversion routine report the error, because it can provide
555 * a more specific error message than we could here. An earlier call
556 * to the conversion routine in CopyConvertBuf() detected that there
557 * is an error, now we call the conversion routine again with
558 * noError=false, to have it throw the error.
559 */
560 unsigned char *src;
561 int srclen;
562 unsigned char *dst;
563 int dstlen;
564
565 src = (unsigned char *) cstate->raw_buf + cstate->raw_buf_index;
566 srclen = cstate->raw_buf_len - cstate->raw_buf_index;
567 dst = (unsigned char *) cstate->input_buf + cstate->input_buf_len;
568 dstlen = INPUT_BUF_SIZE - cstate->input_buf_len + 1;
569
571 cstate->file_encoding,
573 src, srclen,
574 dst, dstlen,
575 false);
576
577 /*
578 * The conversion routine should have reported an error, so this
579 * should not be reached.
580 */
581 elog(ERROR, "encoding conversion failed without error");
582 }
583}
584
585/*
586 * Load more data from data source to raw_buf.
587 *
588 * If RAW_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the
589 * beginning of the buffer, and we load new data after that.
590 */
591static void
593{
594 int nbytes;
595 int inbytes;
596
597 /*
598 * In text mode, if encoding conversion is not required, raw_buf and
599 * input_buf point to the same buffer. Their len/index better agree, too.
600 */
601 if (cstate->raw_buf == cstate->input_buf)
602 {
603 Assert(!cstate->need_transcoding);
604 Assert(cstate->raw_buf_index == cstate->input_buf_index);
605 Assert(cstate->input_buf_len <= cstate->raw_buf_len);
606 }
607
608 /*
609 * Copy down the unprocessed data if any.
610 */
611 nbytes = RAW_BUF_BYTES(cstate);
612 if (nbytes > 0 && cstate->raw_buf_index > 0)
613 memmove(cstate->raw_buf, cstate->raw_buf + cstate->raw_buf_index,
614 nbytes);
615 cstate->raw_buf_len -= cstate->raw_buf_index;
616 cstate->raw_buf_index = 0;
617
618 /*
619 * If raw_buf and input_buf are in fact the same buffer, adjust the
620 * input_buf variables, too.
621 */
622 if (cstate->raw_buf == cstate->input_buf)
623 {
624 cstate->input_buf_len -= cstate->input_buf_index;
625 cstate->input_buf_index = 0;
626 }
627
628 /* Load more data */
629 inbytes = CopyGetData(cstate, cstate->raw_buf + cstate->raw_buf_len,
630 1, RAW_BUF_SIZE - cstate->raw_buf_len);
631 nbytes += inbytes;
632 cstate->raw_buf[nbytes] = '\0';
633 cstate->raw_buf_len = nbytes;
634
635 cstate->bytes_processed += inbytes;
637
638 if (inbytes == 0)
639 cstate->raw_reached_eof = true;
640}
641
642/*
643 * CopyLoadInputBuf loads some more data into input_buf
644 *
645 * On return, at least one more input character is loaded into
646 * input_buf, or input_reached_eof is set.
647 *
648 * If INPUT_BUF_BYTES(cstate) > 0, the unprocessed bytes are moved to the start
649 * of the buffer and then we load more data after that.
650 */
651static void
653{
654 int nbytes = INPUT_BUF_BYTES(cstate);
655
656 /*
657 * The caller has updated input_buf_index to indicate how much of the
658 * input has been consumed and isn't needed anymore. If input_buf is the
659 * same physical area as raw_buf, update raw_buf_index accordingly.
660 */
661 if (cstate->raw_buf == cstate->input_buf)
662 {
663 Assert(!cstate->need_transcoding);
664 Assert(cstate->input_buf_index >= cstate->raw_buf_index);
665 cstate->raw_buf_index = cstate->input_buf_index;
666 }
667
668 for (;;)
669 {
670 /* If we now have some unconverted data, try to convert it */
671 CopyConvertBuf(cstate);
672
673 /* If we now have some more input bytes ready, return them */
674 if (INPUT_BUF_BYTES(cstate) > nbytes)
675 return;
676
677 /*
678 * If we reached an invalid byte sequence, or we're at an incomplete
679 * multi-byte character but there is no more raw input data, report
680 * conversion error.
681 */
682 if (cstate->input_reached_error)
683 CopyConversionError(cstate);
684
685 /* no more input, and everything has been converted */
686 if (cstate->input_reached_eof)
687 break;
688
689 /* Try to load more raw data */
690 Assert(!cstate->raw_reached_eof);
691 CopyLoadRawBuf(cstate);
692 }
693}
694
695/*
696 * CopyReadBinaryData
697 *
698 * Reads up to 'nbytes' bytes from cstate->copy_file via cstate->raw_buf
699 * and writes them to 'dest'. Returns the number of bytes read (which
700 * would be less than 'nbytes' only if we reach EOF).
701 */
702static int
703CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
704{
705 int copied_bytes = 0;
706
707 if (RAW_BUF_BYTES(cstate) >= nbytes)
708 {
709 /* Enough bytes are present in the buffer. */
710 memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, nbytes);
711 cstate->raw_buf_index += nbytes;
712 copied_bytes = nbytes;
713 }
714 else
715 {
716 /*
717 * Not enough bytes in the buffer, so must read from the file. Need
718 * to loop since 'nbytes' could be larger than the buffer size.
719 */
720 do
721 {
722 int copy_bytes;
723
724 /* Load more data if buffer is empty. */
725 if (RAW_BUF_BYTES(cstate) == 0)
726 {
727 CopyLoadRawBuf(cstate);
728 if (cstate->raw_reached_eof)
729 break; /* EOF */
730 }
731
732 /* Transfer some bytes. */
733 copy_bytes = Min(nbytes - copied_bytes, RAW_BUF_BYTES(cstate));
734 memcpy(dest, cstate->raw_buf + cstate->raw_buf_index, copy_bytes);
735 cstate->raw_buf_index += copy_bytes;
736 dest += copy_bytes;
738 } while (copied_bytes < nbytes);
739 }
740
741 return copied_bytes;
742}
743
744/*
745 * This function is exposed for use by extensions that read raw fields in the
746 * next line. See NextCopyFromRawFieldsInternal() for details.
747 */
748bool
749NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
750{
751 return NextCopyFromRawFieldsInternal(cstate, fields, nfields,
752 cstate->opts.csv_mode);
753}
754
755/*
756 * Workhorse for NextCopyFromRawFields().
757 *
758 * Read raw fields in the next line for COPY FROM in text or csv mode. Return
759 * false if no more lines.
760 *
761 * An internal temporary buffer is returned via 'fields'. It is valid until
762 * the next call of the function. Since the function returns all raw fields
763 * in the input file, 'nfields' could be different from the number of columns
764 * in the relation.
765 *
766 * NOTE: the force_not_null option is not applied to the returned fields.
767 *
768 * We use pg_attribute_always_inline to reduce function call overhead
769 * and to help compilers to optimize away the 'is_csv' condition when called
770 * by internal functions such as CopyFromTextLikeOneRow().
771 */
773NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv)
774{
775 int fldct;
776 bool done = false;
777
778 /* only available for text or csv input */
779 Assert(!cstate->opts.binary);
780
781 /* on input check that the header line is correct if needed */
782 if (cstate->cur_lineno == 0 && cstate->opts.header_line != COPY_HEADER_FALSE)
783 {
784 ListCell *cur;
785 TupleDesc tupDesc;
786 int lines_to_skip = cstate->opts.header_line;
787
788 /* If set to "match", one header line is skipped */
789 if (cstate->opts.header_line == COPY_HEADER_MATCH)
790 lines_to_skip = 1;
791
792 tupDesc = RelationGetDescr(cstate->rel);
793
794 for (int i = 0; i < lines_to_skip; i++)
795 {
796 cstate->cur_lineno++;
797 if ((done = CopyReadLine(cstate, is_csv)))
798 break;
799 }
800
801 if (cstate->opts.header_line == COPY_HEADER_MATCH)
802 {
803 int fldnum;
804
805 if (is_csv)
807 else
809
810 if (fldct != list_length(cstate->attnumlist))
813 errmsg("wrong number of fields in header line: got %d, expected %d",
814 fldct, list_length(cstate->attnumlist))));
815
816 fldnum = 0;
817 foreach(cur, cstate->attnumlist)
818 {
819 int attnum = lfirst_int(cur);
820 char *colName;
821 Form_pg_attribute attr = TupleDescAttr(tupDesc, attnum - 1);
822
823 Assert(fldnum < cstate->max_fields);
824
825 colName = cstate->raw_fields[fldnum++];
826 if (colName == NULL)
829 errmsg("column name mismatch in header line field %d: got null value (\"%s\"), expected \"%s\"",
830 fldnum, cstate->opts.null_print, NameStr(attr->attname))));
831
832 if (namestrcmp(&attr->attname, colName) != 0)
833 {
836 errmsg("column name mismatch in header line field %d: got \"%s\", expected \"%s\"",
837 fldnum, colName, NameStr(attr->attname))));
838 }
839 }
840 }
841
842 if (done)
843 return false;
844 }
845
846 cstate->cur_lineno++;
847
848 /* Actually read the line into memory here */
849 done = CopyReadLine(cstate, is_csv);
850
851 /*
852 * EOF at start of line means we're done. If we see EOF after some
853 * characters, we act as though it was newline followed by EOF, ie,
854 * process the line and then exit loop on next iteration.
855 */
856 if (done && cstate->line_buf.len == 0)
857 return false;
858
859 /* Parse the line into de-escaped field values */
860 if (is_csv)
862 else
864
865 *fields = cstate->raw_fields;
866 *nfields = fldct;
867 return true;
868}
869
870/*
871 * Read next tuple from file for COPY FROM. Return false if no more tuples.
872 *
873 * 'econtext' is used to evaluate default expression for each column that is
874 * either not read from the file or is using the DEFAULT option of COPY FROM.
875 * It can be NULL when no default values are used, i.e. when all columns are
876 * read from the file, and DEFAULT option is unset.
877 *
878 * 'values' and 'nulls' arrays must be the same length as columns of the
879 * relation passed to BeginCopyFrom. This function fills the arrays.
880 */
881bool
883 Datum *values, bool *nulls)
884{
885 TupleDesc tupDesc;
887 num_defaults = cstate->num_defaults;
888 int i;
889 int *defmap = cstate->defmap;
890 ExprState **defexprs = cstate->defexprs;
891
892 tupDesc = RelationGetDescr(cstate->rel);
893 num_phys_attrs = tupDesc->natts;
894
895 /* Initialize all values for row to NULL */
896 MemSet(values, 0, num_phys_attrs * sizeof(Datum));
897 MemSet(nulls, true, num_phys_attrs * sizeof(bool));
898 MemSet(cstate->defaults, false, num_phys_attrs * sizeof(bool));
899
900 /* Get one row from source */
901 if (!cstate->routine->CopyFromOneRow(cstate, econtext, values, nulls))
902 return false;
903
904 /*
905 * Now compute and insert any defaults available for the columns not
906 * provided by the input data. Anything not processed here or above will
907 * remain NULL.
908 */
909 for (i = 0; i < num_defaults; i++)
910 {
911 /*
912 * The caller must supply econtext and have switched into the
913 * per-tuple memory context in it.
914 */
915 Assert(econtext != NULL);
917
918 values[defmap[i]] = ExecEvalExpr(defexprs[defmap[i]], econtext,
919 &nulls[defmap[i]]);
920 }
921
922 return true;
923}
924
925/* Implementation of the per-row callback for text format */
926bool
928 bool *nulls)
929{
930 return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, false);
931}
932
933/* Implementation of the per-row callback for CSV format */
934bool
936 bool *nulls)
937{
938 return CopyFromTextLikeOneRow(cstate, econtext, values, nulls, true);
939}
940
941/*
942 * Workhorse for CopyFromTextOneRow() and CopyFromCSVOneRow().
943 *
944 * We use pg_attribute_always_inline to reduce function call overhead
945 * and to help compilers to optimize away the 'is_csv' condition.
946 */
949 Datum *values, bool *nulls, bool is_csv)
950{
951 TupleDesc tupDesc;
953 FmgrInfo *in_functions = cstate->in_functions;
954 Oid *typioparams = cstate->typioparams;
955 ExprState **defexprs = cstate->defexprs;
956 char **field_strings;
957 ListCell *cur;
958 int fldct;
959 int fieldno;
960 char *string;
961
962 tupDesc = RelationGetDescr(cstate->rel);
964
965 /* read raw fields in the next line */
967 return false;
968
969 /* check for overflowing fields */
970 if (attr_count > 0 && fldct > attr_count)
973 errmsg("extra data after last expected column")));
974
975 fieldno = 0;
976
977 /* Loop to read the user attributes on the line. */
978 foreach(cur, cstate->attnumlist)
979 {
980 int attnum = lfirst_int(cur);
981 int m = attnum - 1;
982 Form_pg_attribute att = TupleDescAttr(tupDesc, m);
983
984 if (fieldno >= fldct)
987 errmsg("missing data for column \"%s\"",
988 NameStr(att->attname))));
989 string = field_strings[fieldno++];
990
991 if (cstate->convert_select_flags &&
992 !cstate->convert_select_flags[m])
993 {
994 /* ignore input field, leaving column as NULL */
995 continue;
996 }
997
998 if (is_csv)
999 {
1000 if (string == NULL &&
1001 cstate->opts.force_notnull_flags[m])
1002 {
1003 /*
1004 * FORCE_NOT_NULL option is set and column is NULL - convert
1005 * it to the NULL string.
1006 */
1007 string = cstate->opts.null_print;
1008 }
1009 else if (string != NULL && cstate->opts.force_null_flags[m]
1010 && strcmp(string, cstate->opts.null_print) == 0)
1011 {
1012 /*
1013 * FORCE_NULL option is set and column matches the NULL
1014 * string. It must have been quoted, or otherwise the string
1015 * would already have been set to NULL. Convert it to NULL as
1016 * specified.
1017 */
1018 string = NULL;
1019 }
1020 }
1021
1022 cstate->cur_attname = NameStr(att->attname);
1023 cstate->cur_attval = string;
1024
1025 if (string != NULL)
1026 nulls[m] = false;
1027
1028 if (cstate->defaults[m])
1029 {
1030 /* We must have switched into the per-tuple memory context */
1031 Assert(econtext != NULL);
1033
1034 values[m] = ExecEvalExpr(defexprs[m], econtext, &nulls[m]);
1035 }
1036
1037 /*
1038 * If ON_ERROR is specified with IGNORE, skip rows with soft errors
1039 */
1040 else if (!InputFunctionCallSafe(&in_functions[m],
1041 string,
1042 typioparams[m],
1043 att->atttypmod,
1044 (Node *) cstate->escontext,
1045 &values[m]))
1046 {
1048
1049 cstate->num_errors++;
1050
1052 {
1053 /*
1054 * Since we emit line number and column info in the below
1055 * notice message, we suppress error context information other
1056 * than the relation name.
1057 */
1058 Assert(!cstate->relname_only);
1059 cstate->relname_only = true;
1060
1061 if (cstate->cur_attval)
1062 {
1063 char *attval;
1064
1067 errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": \"%s\"",
1068 cstate->cur_lineno,
1069 cstate->cur_attname,
1070 attval));
1071 pfree(attval);
1072 }
1073 else
1075 errmsg("skipping row due to data type incompatibility at line %" PRIu64 " for column \"%s\": null input",
1076 cstate->cur_lineno,
1077 cstate->cur_attname));
1078
1079 /* reset relname_only */
1080 cstate->relname_only = false;
1081 }
1082
1083 return true;
1084 }
1085
1086 cstate->cur_attname = NULL;
1087 cstate->cur_attval = NULL;
1088 }
1089
1091
1092 return true;
1093}
1094
1095/* Implementation of the per-row callback for binary format */
1096bool
1098 bool *nulls)
1099{
1100 TupleDesc tupDesc;
1102 FmgrInfo *in_functions = cstate->in_functions;
1103 Oid *typioparams = cstate->typioparams;
1105 ListCell *cur;
1106
1107 tupDesc = RelationGetDescr(cstate->rel);
1109
1110 cstate->cur_lineno++;
1111
1112 if (!CopyGetInt16(cstate, &fld_count))
1113 {
1114 /* EOF detected (end of file, or protocol-level EOF) */
1115 return false;
1116 }
1117
1118 if (fld_count == -1)
1119 {
1120 /*
1121 * Received EOF marker. Wait for the protocol-level EOF, and complain
1122 * if it doesn't come immediately. In COPY FROM STDIN, this ensures
1123 * that we correctly handle CopyFail, if client chooses to send that
1124 * now. When copying from file, we could ignore the rest of the file
1125 * like in text mode, but we choose to be consistent with the COPY
1126 * FROM STDIN case.
1127 */
1128 char dummy;
1129
1130 if (CopyReadBinaryData(cstate, &dummy, 1) > 0)
1131 ereport(ERROR,
1133 errmsg("received copy data after EOF marker")));
1134 return false;
1135 }
1136
1137 if (fld_count != attr_count)
1138 ereport(ERROR,
1140 errmsg("row field count is %d, expected %d",
1142
1143 foreach(cur, cstate->attnumlist)
1144 {
1145 int attnum = lfirst_int(cur);
1146 int m = attnum - 1;
1147 Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1148
1149 cstate->cur_attname = NameStr(att->attname);
1150 values[m] = CopyReadBinaryAttribute(cstate,
1151 &in_functions[m],
1152 typioparams[m],
1153 att->atttypmod,
1154 &nulls[m]);
1155 cstate->cur_attname = NULL;
1156 }
1157
1158 return true;
1159}
1160
1161/*
1162 * Read the next input line and stash it in line_buf.
1163 *
1164 * Result is true if read was terminated by EOF, false if terminated
1165 * by newline. The terminating newline or EOF marker is not included
1166 * in the final value of line_buf.
1167 */
1168static bool
1170{
1171 bool result;
1172
1173 resetStringInfo(&cstate->line_buf);
1174 cstate->line_buf_valid = false;
1175
1176 /* Parse data and transfer into line_buf */
1177 result = CopyReadLineText(cstate, is_csv);
1178
1179 if (result)
1180 {
1181 /*
1182 * Reached EOF. In protocol version 3, we should ignore anything
1183 * after \. up to the protocol end of copy data. (XXX maybe better
1184 * not to treat \. as special?)
1185 */
1186 if (cstate->copy_src == COPY_FRONTEND)
1187 {
1188 int inbytes;
1189
1190 do
1191 {
1192 inbytes = CopyGetData(cstate, cstate->input_buf,
1193 1, INPUT_BUF_SIZE);
1194 } while (inbytes > 0);
1195 cstate->input_buf_index = 0;
1196 cstate->input_buf_len = 0;
1197 cstate->raw_buf_index = 0;
1198 cstate->raw_buf_len = 0;
1199 }
1200 }
1201 else
1202 {
1203 /*
1204 * If we didn't hit EOF, then we must have transferred the EOL marker
1205 * to line_buf along with the data. Get rid of it.
1206 */
1207 switch (cstate->eol_type)
1208 {
1209 case EOL_NL:
1210 Assert(cstate->line_buf.len >= 1);
1211 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1212 cstate->line_buf.len--;
1213 cstate->line_buf.data[cstate->line_buf.len] = '\0';
1214 break;
1215 case EOL_CR:
1216 Assert(cstate->line_buf.len >= 1);
1217 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\r');
1218 cstate->line_buf.len--;
1219 cstate->line_buf.data[cstate->line_buf.len] = '\0';
1220 break;
1221 case EOL_CRNL:
1222 Assert(cstate->line_buf.len >= 2);
1223 Assert(cstate->line_buf.data[cstate->line_buf.len - 2] == '\r');
1224 Assert(cstate->line_buf.data[cstate->line_buf.len - 1] == '\n');
1225 cstate->line_buf.len -= 2;
1226 cstate->line_buf.data[cstate->line_buf.len] = '\0';
1227 break;
1228 case EOL_UNKNOWN:
1229 /* shouldn't get here */
1230 Assert(false);
1231 break;
1232 }
1233 }
1234
1235 /* Now it's safe to use the buffer in error messages */
1236 cstate->line_buf_valid = true;
1237
1238 return result;
1239}
1240
1241/*
1242 * CopyReadLineText - inner loop of CopyReadLine for text mode
1243 */
1244static bool
1246{
1247 char *copy_input_buf;
1248 int input_buf_ptr;
1249 int copy_buf_len;
1250 bool need_data = false;
1251 bool hit_eof = false;
1252 bool result = false;
1253
1254 /* CSV variables */
1255 bool in_quote = false,
1256 last_was_esc = false;
1257 char quotec = '\0';
1258 char escapec = '\0';
1259
1260 if (is_csv)
1261 {
1262 quotec = cstate->opts.quote[0];
1263 escapec = cstate->opts.escape[0];
1264 /* ignore special escape processing if it's the same as quotec */
1265 if (quotec == escapec)
1266 escapec = '\0';
1267 }
1268
1269 /*
1270 * The objective of this loop is to transfer the entire next input line
1271 * into line_buf. Hence, we only care for detecting newlines (\r and/or
1272 * \n) and the end-of-copy marker (\.).
1273 *
1274 * In CSV mode, \r and \n inside a quoted field are just part of the data
1275 * value and are put in line_buf. We keep just enough state to know if we
1276 * are currently in a quoted field or not.
1277 *
1278 * The input has already been converted to the database encoding. All
1279 * supported server encodings have the property that all bytes in a
1280 * multi-byte sequence have the high bit set, so a multibyte character
1281 * cannot contain any newline or escape characters embedded in the
1282 * multibyte sequence. Therefore, we can process the input byte-by-byte,
1283 * regardless of the encoding.
1284 *
1285 * For speed, we try to move data from input_buf to line_buf in chunks
1286 * rather than one character at a time. input_buf_ptr points to the next
1287 * character to examine; any characters from input_buf_index to
1288 * input_buf_ptr have been determined to be part of the line, but not yet
1289 * transferred to line_buf.
1290 *
1291 * For a little extra speed within the loop, we copy input_buf and
1292 * input_buf_len into local variables.
1293 */
1294 copy_input_buf = cstate->input_buf;
1296 copy_buf_len = cstate->input_buf_len;
1297
1298 for (;;)
1299 {
1300 int prev_raw_ptr;
1301 char c;
1302
1303 /*
1304 * Load more data if needed.
1305 *
1306 * TODO: We could just force four bytes of read-ahead and avoid the
1307 * many calls to IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(). That was
1308 * unsafe with the old v2 COPY protocol, but we don't support that
1309 * anymore.
1310 */
1312 {
1314
1315 CopyLoadInputBuf(cstate);
1316 /* update our local variables */
1317 hit_eof = cstate->input_reached_eof;
1319 copy_buf_len = cstate->input_buf_len;
1320
1321 /*
1322 * If we are completely out of data, break out of the loop,
1323 * reporting EOF.
1324 */
1325 if (INPUT_BUF_BYTES(cstate) <= 0)
1326 {
1327 result = true;
1328 break;
1329 }
1330 need_data = false;
1331 }
1332
1333 /* OK to fetch a character */
1336
1337 if (is_csv)
1338 {
1339 /*
1340 * If character is '\r', we may need to look ahead below. Force
1341 * fetch of the next character if we don't already have it. We
1342 * need to do this before changing CSV state, in case '\r' is also
1343 * the quote or escape character.
1344 */
1345 if (c == '\r')
1346 {
1348 }
1349
1350 /*
1351 * Dealing with quotes and escapes here is mildly tricky. If the
1352 * quote char is also the escape char, there's no problem - we
1353 * just use the char as a toggle. If they are different, we need
1354 * to ensure that we only take account of an escape inside a
1355 * quoted field and immediately preceding a quote char, and not
1356 * the second in an escape-escape sequence.
1357 */
1358 if (in_quote && c == escapec)
1360 if (c == quotec && !last_was_esc)
1361 in_quote = !in_quote;
1362 if (c != escapec)
1363 last_was_esc = false;
1364
1365 /*
1366 * Updating the line count for embedded CR and/or LF chars is
1367 * necessarily a little fragile - this test is probably about the
1368 * best we can do. (XXX it's arguable whether we should do this
1369 * at all --- is cur_lineno a physical or logical count?)
1370 */
1371 if (in_quote && c == (cstate->eol_type == EOL_NL ? '\n' : '\r'))
1372 cstate->cur_lineno++;
1373 }
1374
1375 /* Process \r */
1376 if (c == '\r' && (!is_csv || !in_quote))
1377 {
1378 /* Check for \r\n on first line, _and_ handle \r\n. */
1379 if (cstate->eol_type == EOL_UNKNOWN ||
1380 cstate->eol_type == EOL_CRNL)
1381 {
1382 /*
1383 * If need more data, go back to loop top to load it.
1384 *
1385 * Note that if we are at EOF, c will wind up as '\0' because
1386 * of the guaranteed pad of input_buf.
1387 */
1389
1390 /* get next char */
1392
1393 if (c == '\n')
1394 {
1395 input_buf_ptr++; /* eat newline */
1396 cstate->eol_type = EOL_CRNL; /* in case not set yet */
1397 }
1398 else
1399 {
1400 /* found \r, but no \n */
1401 if (cstate->eol_type == EOL_CRNL)
1402 ereport(ERROR,
1404 !is_csv ?
1405 errmsg("literal carriage return found in data") :
1406 errmsg("unquoted carriage return found in data"),
1407 !is_csv ?
1408 errhint("Use \"\\r\" to represent carriage return.") :
1409 errhint("Use quoted CSV field to represent carriage return.")));
1410
1411 /*
1412 * if we got here, it is the first line and we didn't find
1413 * \n, so don't consume the peeked character
1414 */
1415 cstate->eol_type = EOL_CR;
1416 }
1417 }
1418 else if (cstate->eol_type == EOL_NL)
1419 ereport(ERROR,
1421 !is_csv ?
1422 errmsg("literal carriage return found in data") :
1423 errmsg("unquoted carriage return found in data"),
1424 !is_csv ?
1425 errhint("Use \"\\r\" to represent carriage return.") :
1426 errhint("Use quoted CSV field to represent carriage return.")));
1427 /* If reach here, we have found the line terminator */
1428 break;
1429 }
1430
1431 /* Process \n */
1432 if (c == '\n' && (!is_csv || !in_quote))
1433 {
1434 if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
1435 ereport(ERROR,
1437 !is_csv ?
1438 errmsg("literal newline found in data") :
1439 errmsg("unquoted newline found in data"),
1440 !is_csv ?
1441 errhint("Use \"\\n\" to represent newline.") :
1442 errhint("Use quoted CSV field to represent newline.")));
1443 cstate->eol_type = EOL_NL; /* in case not set yet */
1444 /* If reach here, we have found the line terminator */
1445 break;
1446 }
1447
1448 /*
1449 * Process backslash, except in CSV mode where backslash is a normal
1450 * character.
1451 */
1452 if (c == '\\' && !is_csv)
1453 {
1454 char c2;
1455
1458
1459 /* -----
1460 * get next character
1461 * Note: we do not change c so if it isn't \., we can fall
1462 * through and continue processing.
1463 * -----
1464 */
1466
1467 if (c2 == '.')
1468 {
1469 input_buf_ptr++; /* consume the '.' */
1470 if (cstate->eol_type == EOL_CRNL)
1471 {
1472 /* Get the next character */
1474 /* if hit_eof, c2 will become '\0' */
1476
1477 if (c2 == '\n')
1478 ereport(ERROR,
1480 errmsg("end-of-copy marker does not match previous newline style")));
1481 else if (c2 != '\r')
1482 ereport(ERROR,
1484 errmsg("end-of-copy marker is not alone on its line")));
1485 }
1486
1487 /* Get the next character */
1489 /* if hit_eof, c2 will become '\0' */
1491
1492 if (c2 != '\r' && c2 != '\n')
1493 ereport(ERROR,
1495 errmsg("end-of-copy marker is not alone on its line")));
1496
1497 if ((cstate->eol_type == EOL_NL && c2 != '\n') ||
1498 (cstate->eol_type == EOL_CRNL && c2 != '\n') ||
1499 (cstate->eol_type == EOL_CR && c2 != '\r'))
1500 ereport(ERROR,
1502 errmsg("end-of-copy marker does not match previous newline style")));
1503
1504 /*
1505 * If there is any data on this line before the \., complain.
1506 */
1507 if (cstate->line_buf.len > 0 ||
1508 prev_raw_ptr > cstate->input_buf_index)
1509 ereport(ERROR,
1511 errmsg("end-of-copy marker is not alone on its line")));
1512
1513 /*
1514 * Discard the \. and newline, then report EOF.
1515 */
1517 result = true; /* report EOF */
1518 break;
1519 }
1520 else
1521 {
1522 /*
1523 * If we are here, it means we found a backslash followed by
1524 * something other than a period. In non-CSV mode, anything
1525 * after a backslash is special, so we skip over that second
1526 * character too. If we didn't do that \\. would be
1527 * considered an eof-of copy, while in non-CSV mode it is a
1528 * literal backslash followed by a period.
1529 */
1530 input_buf_ptr++;
1531 }
1532 }
1533 } /* end of outer loop */
1534
1535 /*
1536 * Transfer any still-uncopied data to line_buf.
1537 */
1539
1540 return result;
1541}
1542
/*
 * Return decimal value for a hexadecimal digit
 *
 * '0'..'9' map to 0..9; any other character is lower-cased with
 * pg_ascii_tolower() and mapped relative to 'a', so 'a'/'A' give 10 and
 * 'f'/'F' give 15.  No validation is done here: the caller is expected to
 * have checked the character with isxdigit() first.
 */
static int
GetDecimalFromHex(char hex)
{
	if (isdigit((unsigned char) hex))
		return hex - '0';
	else
		return pg_ascii_tolower((unsigned char) hex) - 'a' + 10;
}
1554
1555/*
1556 * Parse the current line into separate attributes (fields),
1557 * performing de-escaping as needed.
1558 *
1559 * The input is in line_buf. We use attribute_buf to hold the result
1560 * strings. cstate->raw_fields[k] is set to point to the k'th attribute
1561 * string, or NULL when the input matches the null marker string.
1562 * This array is expanded as necessary.
1563 *
1564 * (Note that the caller cannot check for nulls since the returned
1565 * string would be the post-de-escaping equivalent, which may look
1566 * the same as some valid data string.)
1567 *
1568 * delim is the column delimiter string (must be just one byte for now).
1569 * null_print is the null marker string. Note that this is compared to
1570 * the pre-de-escaped input string.
1571 *
1572 * The return value is the number of fields actually read.
1573 */
1574static int
1576{
1577 char delimc = cstate->opts.delim[0];
1578 int fieldno;
1579 char *output_ptr;
1580 char *cur_ptr;
1581 char *line_end_ptr;
1582
1583 /*
1584 * We need a special case for zero-column tables: check that the input
1585 * line is empty, and return.
1586 */
1587 if (cstate->max_fields <= 0)
1588 {
1589 if (cstate->line_buf.len != 0)
1590 ereport(ERROR,
1592 errmsg("extra data after last expected column")));
1593 return 0;
1594 }
1595
1597
1598 /*
1599 * The de-escaped attributes will certainly not be longer than the input
1600 * data line, so we can just force attribute_buf to be large enough and
1601 * then transfer data without any checks for enough space. We need to do
1602 * it this way because enlarging attribute_buf mid-stream would invalidate
1603 * pointers already stored into cstate->raw_fields[].
1604 */
1605 if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1606 enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1607 output_ptr = cstate->attribute_buf.data;
1608
1609 /* set pointer variables for loop */
1610 cur_ptr = cstate->line_buf.data;
1611 line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1612
1613 /* Outer loop iterates over fields */
1614 fieldno = 0;
1615 for (;;)
1616 {
1617 bool found_delim = false;
1618 char *start_ptr;
1619 char *end_ptr;
1620 int input_len;
1621 bool saw_non_ascii = false;
1622
1623 /* Make sure there is enough space for the next value */
1624 if (fieldno >= cstate->max_fields)
1625 {
1626 cstate->max_fields *= 2;
1627 cstate->raw_fields =
1628 repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1629 }
1630
1631 /* Remember start of field on both input and output sides */
1633 cstate->raw_fields[fieldno] = output_ptr;
1634
1635 /*
1636 * Scan data for field.
1637 *
1638 * Note that in this loop, we are scanning to locate the end of field
1639 * and also speculatively performing de-escaping. Once we find the
1640 * end-of-field, we can match the raw field contents against the null
1641 * marker string. Only after that comparison fails do we know that
1642 * de-escaping is actually the right thing to do; therefore we *must
1643 * not* throw any syntax errors before we've done the null-marker
1644 * check.
1645 */
1646 for (;;)
1647 {
1648 char c;
1649
1650 end_ptr = cur_ptr;
1651 if (cur_ptr >= line_end_ptr)
1652 break;
1653 c = *cur_ptr++;
1654 if (c == delimc)
1655 {
1656 found_delim = true;
1657 break;
1658 }
1659 if (c == '\\')
1660 {
1661 if (cur_ptr >= line_end_ptr)
1662 break;
1663 c = *cur_ptr++;
1664 switch (c)
1665 {
1666 case '0':
1667 case '1':
1668 case '2':
1669 case '3':
1670 case '4':
1671 case '5':
1672 case '6':
1673 case '7':
1674 {
1675 /* handle \013 */
1676 int val;
1677
1678 val = OCTVALUE(c);
1679 if (cur_ptr < line_end_ptr)
1680 {
1681 c = *cur_ptr;
1682 if (ISOCTAL(c))
1683 {
1684 cur_ptr++;
1685 val = (val << 3) + OCTVALUE(c);
1686 if (cur_ptr < line_end_ptr)
1687 {
1688 c = *cur_ptr;
1689 if (ISOCTAL(c))
1690 {
1691 cur_ptr++;
1692 val = (val << 3) + OCTVALUE(c);
1693 }
1694 }
1695 }
1696 }
1697 c = val & 0377;
1698 if (c == '\0' || IS_HIGHBIT_SET(c))
1699 saw_non_ascii = true;
1700 }
1701 break;
1702 case 'x':
1703 /* Handle \x3F */
1704 if (cur_ptr < line_end_ptr)
1705 {
1706 char hexchar = *cur_ptr;
1707
1708 if (isxdigit((unsigned char) hexchar))
1709 {
1711
1712 cur_ptr++;
1713 if (cur_ptr < line_end_ptr)
1714 {
1715 hexchar = *cur_ptr;
1716 if (isxdigit((unsigned char) hexchar))
1717 {
1718 cur_ptr++;
1719 val = (val << 4) + GetDecimalFromHex(hexchar);
1720 }
1721 }
1722 c = val & 0xff;
1723 if (c == '\0' || IS_HIGHBIT_SET(c))
1724 saw_non_ascii = true;
1725 }
1726 }
1727 break;
1728 case 'b':
1729 c = '\b';
1730 break;
1731 case 'f':
1732 c = '\f';
1733 break;
1734 case 'n':
1735 c = '\n';
1736 break;
1737 case 'r':
1738 c = '\r';
1739 break;
1740 case 't':
1741 c = '\t';
1742 break;
1743 case 'v':
1744 c = '\v';
1745 break;
1746
1747 /*
1748 * in all other cases, take the char after '\'
1749 * literally
1750 */
1751 }
1752 }
1753
1754 /* Add c to output string */
1755 *output_ptr++ = c;
1756 }
1757
1758 /* Check whether raw input matched null marker */
1759 input_len = end_ptr - start_ptr;
1760 if (input_len == cstate->opts.null_print_len &&
1761 strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1762 cstate->raw_fields[fieldno] = NULL;
1763 /* Check whether raw input matched default marker */
1764 else if (fieldno < list_length(cstate->attnumlist) &&
1765 cstate->opts.default_print &&
1766 input_len == cstate->opts.default_print_len &&
1767 strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
1768 {
1769 /* fieldno is 0-indexed and attnum is 1-indexed */
1770 int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
1771
1772 if (cstate->defexprs[m] != NULL)
1773 {
1774 /* defaults contain entries for all physical attributes */
1775 cstate->defaults[m] = true;
1776 }
1777 else
1778 {
1779 TupleDesc tupDesc = RelationGetDescr(cstate->rel);
1780 Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1781
1782 ereport(ERROR,
1784 errmsg("unexpected default marker in COPY data"),
1785 errdetail("Column \"%s\" has no default value.",
1786 NameStr(att->attname))));
1787 }
1788 }
1789 else
1790 {
1791 /*
1792 * At this point we know the field is supposed to contain data.
1793 *
1794 * If we de-escaped any non-7-bit-ASCII chars, make sure the
1795 * resulting string is valid data for the db encoding.
1796 */
1797 if (saw_non_ascii)
1798 {
1799 char *fld = cstate->raw_fields[fieldno];
1800
1801 pg_verifymbstr(fld, output_ptr - fld, false);
1802 }
1803 }
1804
1805 /* Terminate attribute value in output area */
1806 *output_ptr++ = '\0';
1807
1808 fieldno++;
1809 /* Done if we hit EOL instead of a delim */
1810 if (!found_delim)
1811 break;
1812 }
1813
1814 /* Clean up state of attribute_buf */
1815 output_ptr--;
1816 Assert(*output_ptr == '\0');
1817 cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
1818
1819 return fieldno;
1820}
1821
1822/*
1823 * Parse the current line into separate attributes (fields),
1824 * performing de-escaping as needed. This has exactly the same API as
1825 * CopyReadAttributesText, except we parse the fields according to
1826 * "standard" (i.e. common) CSV usage.
1827 */
1828static int
1830{
1831 char delimc = cstate->opts.delim[0];
1832 char quotec = cstate->opts.quote[0];
1833 char escapec = cstate->opts.escape[0];
1834 int fieldno;
1835 char *output_ptr;
1836 char *cur_ptr;
1837 char *line_end_ptr;
1838
1839 /*
1840 * We need a special case for zero-column tables: check that the input
1841 * line is empty, and return.
1842 */
1843 if (cstate->max_fields <= 0)
1844 {
1845 if (cstate->line_buf.len != 0)
1846 ereport(ERROR,
1848 errmsg("extra data after last expected column")));
1849 return 0;
1850 }
1851
1853
1854 /*
1855 * The de-escaped attributes will certainly not be longer than the input
1856 * data line, so we can just force attribute_buf to be large enough and
1857 * then transfer data without any checks for enough space. We need to do
1858 * it this way because enlarging attribute_buf mid-stream would invalidate
1859 * pointers already stored into cstate->raw_fields[].
1860 */
1861 if (cstate->attribute_buf.maxlen <= cstate->line_buf.len)
1862 enlargeStringInfo(&cstate->attribute_buf, cstate->line_buf.len);
1863 output_ptr = cstate->attribute_buf.data;
1864
1865 /* set pointer variables for loop */
1866 cur_ptr = cstate->line_buf.data;
1867 line_end_ptr = cstate->line_buf.data + cstate->line_buf.len;
1868
1869 /* Outer loop iterates over fields */
1870 fieldno = 0;
1871 for (;;)
1872 {
1873 bool found_delim = false;
1874 bool saw_quote = false;
1875 char *start_ptr;
1876 char *end_ptr;
1877 int input_len;
1878
1879 /* Make sure there is enough space for the next value */
1880 if (fieldno >= cstate->max_fields)
1881 {
1882 cstate->max_fields *= 2;
1883 cstate->raw_fields =
1884 repalloc(cstate->raw_fields, cstate->max_fields * sizeof(char *));
1885 }
1886
1887 /* Remember start of field on both input and output sides */
1889 cstate->raw_fields[fieldno] = output_ptr;
1890
1891 /*
1892 * Scan data for field,
1893 *
1894 * The loop starts in "not quote" mode and then toggles between that
1895 * and "in quote" mode. The loop exits normally if it is in "not
1896 * quote" mode and a delimiter or line end is seen.
1897 */
1898 for (;;)
1899 {
1900 char c;
1901
1902 /* Not in quote */
1903 for (;;)
1904 {
1905 end_ptr = cur_ptr;
1906 if (cur_ptr >= line_end_ptr)
1907 goto endfield;
1908 c = *cur_ptr++;
1909 /* unquoted field delimiter */
1910 if (c == delimc)
1911 {
1912 found_delim = true;
1913 goto endfield;
1914 }
1915 /* start of quoted field (or part of field) */
1916 if (c == quotec)
1917 {
1918 saw_quote = true;
1919 break;
1920 }
1921 /* Add c to output string */
1922 *output_ptr++ = c;
1923 }
1924
1925 /* In quote */
1926 for (;;)
1927 {
1928 end_ptr = cur_ptr;
1929 if (cur_ptr >= line_end_ptr)
1930 ereport(ERROR,
1932 errmsg("unterminated CSV quoted field")));
1933
1934 c = *cur_ptr++;
1935
1936 /* escape within a quoted field */
1937 if (c == escapec)
1938 {
1939 /*
1940 * peek at the next char if available, and escape it if it
1941 * is an escape char or a quote char
1942 */
1943 if (cur_ptr < line_end_ptr)
1944 {
1945 char nextc = *cur_ptr;
1946
1947 if (nextc == escapec || nextc == quotec)
1948 {
1949 *output_ptr++ = nextc;
1950 cur_ptr++;
1951 continue;
1952 }
1953 }
1954 }
1955
1956 /*
1957 * end of quoted field. Must do this test after testing for
1958 * escape in case quote char and escape char are the same
1959 * (which is the common case).
1960 */
1961 if (c == quotec)
1962 break;
1963
1964 /* Add c to output string */
1965 *output_ptr++ = c;
1966 }
1967 }
1968endfield:
1969
1970 /* Terminate attribute value in output area */
1971 *output_ptr++ = '\0';
1972
1973 /* Check whether raw input matched null marker */
1974 input_len = end_ptr - start_ptr;
1975 if (!saw_quote && input_len == cstate->opts.null_print_len &&
1976 strncmp(start_ptr, cstate->opts.null_print, input_len) == 0)
1977 cstate->raw_fields[fieldno] = NULL;
1978 /* Check whether raw input matched default marker */
1979 else if (fieldno < list_length(cstate->attnumlist) &&
1980 cstate->opts.default_print &&
1981 input_len == cstate->opts.default_print_len &&
1982 strncmp(start_ptr, cstate->opts.default_print, input_len) == 0)
1983 {
1984 /* fieldno is 0-index and attnum is 1-index */
1985 int m = list_nth_int(cstate->attnumlist, fieldno) - 1;
1986
1987 if (cstate->defexprs[m] != NULL)
1988 {
1989 /* defaults contain entries for all physical attributes */
1990 cstate->defaults[m] = true;
1991 }
1992 else
1993 {
1994 TupleDesc tupDesc = RelationGetDescr(cstate->rel);
1995 Form_pg_attribute att = TupleDescAttr(tupDesc, m);
1996
1997 ereport(ERROR,
1999 errmsg("unexpected default marker in COPY data"),
2000 errdetail("Column \"%s\" has no default value.",
2001 NameStr(att->attname))));
2002 }
2003 }
2004
2005 fieldno++;
2006 /* Done if we hit EOL instead of a delim */
2007 if (!found_delim)
2008 break;
2009 }
2010
2011 /* Clean up state of attribute_buf */
2012 output_ptr--;
2013 Assert(*output_ptr == '\0');
2014 cstate->attribute_buf.len = (output_ptr - cstate->attribute_buf.data);
2015
2016 return fieldno;
2017}
2018
2019
2020/*
2021 * Read a binary attribute
2022 */
2023static Datum
2025 Oid typioparam, int32 typmod,
2026 bool *isnull)
2027{
2029 Datum result;
2030
2031 if (!CopyGetInt32(cstate, &fld_size))
2032 ereport(ERROR,
2034 errmsg("unexpected EOF in COPY data")));
2035 if (fld_size == -1)
2036 {
2037 *isnull = true;
2038 return ReceiveFunctionCall(flinfo, NULL, typioparam, typmod);
2039 }
2040 if (fld_size < 0)
2041 ereport(ERROR,
2043 errmsg("invalid field size")));
2044
2045 /* reset attribute_buf to empty, and load raw data in it */
2047
2049 if (CopyReadBinaryData(cstate, cstate->attribute_buf.data,
2050 fld_size) != fld_size)
2051 ereport(ERROR,
2053 errmsg("unexpected EOF in COPY data")));
2054
2055 cstate->attribute_buf.len = fld_size;
2056 cstate->attribute_buf.data[fld_size] = '\0';
2057
2058 /* Call the column type's binary input converter */
2059 result = ReceiveFunctionCall(flinfo, &cstate->attribute_buf,
2060 typioparam, typmod);
2061
2062 /* Trouble if it didn't eat the whole buffer */
2063 if (cstate->attribute_buf.cursor != cstate->attribute_buf.len)
2064 ereport(ERROR,
2066 errmsg("incorrect binary data format")));
2067
2068 *isnull = false;
2069 return result;
2070}
int16 AttrNumber
Definition attnum.h:21
void pgstat_progress_update_param(int index, int64 val)
static Datum values[MAXATTR]
Definition bootstrap.c:155
#define NameStr(name)
Definition c.h:765
#define Min(x, y)
Definition c.h:997
#define IS_HIGHBIT_SET(ch)
Definition c.h:1150
#define Assert(condition)
Definition c.h:873
#define pg_attribute_always_inline
Definition c.h:279
int16_t int16
Definition c.h:541
int32_t int32
Definition c.h:542
uint16_t uint16
Definition c.h:545
uint32_t uint32
Definition c.h:546
#define MemSet(start, val, len)
Definition c.h:1013
char * CopyLimitPrintoutLength(const char *str)
Definition copyfrom.c:333
#define RAW_BUF_BYTES(cstate)
#define INPUT_BUF_SIZE
@ EOL_CR
@ EOL_CRNL
@ EOL_UNKNOWN
@ EOL_NL
#define INPUT_BUF_BYTES(cstate)
#define RAW_BUF_SIZE
static pg_attribute_always_inline bool CopyFromTextLikeOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls, bool is_csv)
static pg_attribute_always_inline bool NextCopyFromRawFieldsInternal(CopyFromState cstate, char ***fields, int *nfields, bool is_csv)
bool CopyFromTextOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
bool CopyFromCSVOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
static int CopyReadAttributesCSV(CopyFromState cstate)
static bool CopyGetInt16(CopyFromState cstate, int16 *val)
static void CopyConversionError(CopyFromState cstate)
static bool CopyGetInt32(CopyFromState cstate, int32 *val)
static void CopyLoadRawBuf(CopyFromState cstate)
#define OCTVALUE(c)
#define REFILL_LINEBUF
static void CopyLoadInputBuf(CopyFromState cstate)
#define ISOCTAL(c)
void ReceiveCopyBinaryHeader(CopyFromState cstate)
static int CopyGetData(CopyFromState cstate, void *databuf, int minread, int maxread)
static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo, Oid typioparam, int32 typmod, bool *isnull)
static int GetDecimalFromHex(char hex)
void ReceiveCopyBegin(CopyFromState cstate)
static bool CopyReadLineText(CopyFromState cstate, bool is_csv)
#define IF_NEED_REFILL_AND_EOF_BREAK(extralen)
static int CopyReadAttributesText(CopyFromState cstate)
static const char BinarySignature[11]
#define IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(extralen)
static bool CopyReadLine(CopyFromState cstate, bool is_csv)
static int CopyReadBinaryData(CopyFromState cstate, char *dest, int nbytes)
bool CopyFromBinaryOneRow(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
static void CopyConvertBuf(CopyFromState cstate)
bool NextCopyFrom(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
bool NextCopyFromRawFields(CopyFromState cstate, char ***fields, int *nfields)
@ COPY_FILE
Definition copyto.c:47
@ COPY_CALLBACK
Definition copyto.c:49
@ COPY_FRONTEND
Definition copyto.c:48
struct cursor * cur
Definition ecpg.c:29
int errcode_for_file_access(void)
Definition elog.c:886
int errdetail(const char *fmt,...)
Definition elog.c:1216
int errhint(const char *fmt,...)
Definition elog.c:1330
int errcode(int sqlerrcode)
Definition elog.c:863
int errmsg(const char *fmt,...)
Definition elog.c:1080
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define NOTICE
Definition elog.h:35
#define ereport(elevel,...)
Definition elog.h:150
static Datum ExecEvalExpr(ExprState *state, ExprContext *econtext, bool *isNull)
Definition executor.h:393
bool InputFunctionCallSafe(FmgrInfo *flinfo, char *str, Oid typioparam, int32 typmod, Node *escontext, Datum *result)
Definition fmgr.c:1585
Datum ReceiveFunctionCall(FmgrInfo *flinfo, StringInfo buf, Oid typioparam, int32 typmod)
Definition fmgr.c:1697
@ COPY_ON_ERROR_STOP
Definition copy.h:36
@ COPY_LOG_VERBOSITY_VERBOSE
Definition copy.h:48
#define COPY_HEADER_MATCH
Definition copy.h:26
#define COPY_HEADER_FALSE
Definition copy.h:27
long val
Definition informix.c:689
int i
Definition isn.c:77
#define pq_flush()
Definition libpq.h:46
#define PQ_SMALL_MESSAGE_LIMIT
Definition libpq.h:30
#define PQ_LARGE_MESSAGE_LIMIT
Definition libpq.h:31
int GetDatabaseEncoding(void)
Definition mbutils.c:1264
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
Definition mbutils.c:1559
int pg_do_encoding_conversion_buf(Oid proc, int src_encoding, int dest_encoding, unsigned char *src, int srclen, unsigned char *dest, int destlen, bool noError)
Definition mbutils.c:470
void report_invalid_encoding(int encoding, const char *mbstr, int len)
Definition mbutils.c:1701
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
#define HOLD_CANCEL_INTERRUPTS()
Definition miscadmin.h:142
#define RESUME_CANCEL_INTERRUPTS()
Definition miscadmin.h:144
int namestrcmp(Name name, const char *str)
Definition name.c:247
int16 attnum
FormData_pg_attribute * Form_pg_attribute
static char format
#define pg_ntoh32(x)
Definition pg_bswap.h:125
#define pg_ntoh16(x)
Definition pg_bswap.h:124
static int list_length(const List *l)
Definition pg_list.h:152
#define lfirst_int(lc)
Definition pg_list.h:173
static int list_nth_int(const List *list, int n)
Definition pg_list.h:310
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define MAX_CONVERSION_INPUT_LENGTH
Definition pg_wchar.h:320
static unsigned char pg_ascii_tolower(unsigned char ch)
Definition port.h:188
uint64_t Datum
Definition postgres.h:70
unsigned int Oid
int pq_getmessage(StringInfo s, int maxlen)
Definition pqcomm.c:1203
int pq_getbyte(void)
Definition pqcomm.c:963
void pq_startmsgread(void)
Definition pqcomm.c:1141
const char * pq_getmsgstring(StringInfo msg)
Definition pqformat.c:578
void pq_endmessage(StringInfo buf)
Definition pqformat.c:296
void pq_copymsgbytes(StringInfo msg, void *buf, int datalen)
Definition pqformat.c:527
void pq_beginmessage(StringInfo buf, char msgtype)
Definition pqformat.c:88
static void pq_sendbyte(StringInfo buf, uint8 byt)
Definition pqformat.h:160
static void pq_sendint16(StringInfo buf, uint16 i)
Definition pqformat.h:136
char * c
static int fb(int x)
char string[11]
#define PROGRESS_COPY_BYTES_PROCESSED
Definition progress.h:164
#define PqMsg_CopyDone
Definition protocol.h:64
#define PqMsg_CopyData
Definition protocol.h:65
#define PqMsg_CopyInResponse
Definition protocol.h:45
#define PqMsg_Sync
Definition protocol.h:27
#define PqMsg_CopyFail
Definition protocol.h:29
#define PqMsg_Flush
Definition protocol.h:24
#define RelationGetDescr(relation)
Definition rel.h:540
StringInfo makeStringInfo(void)
Definition stringinfo.c:72
void resetStringInfo(StringInfo str)
Definition stringinfo.c:126
void enlargeStringInfo(StringInfo str, int needed)
Definition stringinfo.c:337
int header_line
Definition copy.h:64
int default_print_len
Definition copy.h:70
int null_print_len
Definition copy.h:67
CopyLogVerbosityChoice log_verbosity
Definition copy.h:85
char * quote
Definition copy.h:72
CopyOnErrorChoice on_error
Definition copy.h:84
char * escape
Definition copy.h:73
char * null_print
Definition copy.h:66
char * delim
Definition copy.h:71
bool * force_notnull_flags
Definition copy.h:79
bool csv_mode
Definition copy.h:63
bool * force_null_flags
Definition copy.h:82
char * default_print
Definition copy.h:69
bool(* CopyFromOneRow)(CopyFromState cstate, ExprContext *econtext, Datum *values, bool *nulls)
Definition copyapi.h:96
copy_data_source_cb data_source_cb
const struct CopyFromRoutine * routine
StringInfoData line_buf
CopyFormatOptions opts
StringInfoData attribute_buf
const char * cur_attval
const char * cur_attname
ErrorSaveContext * escontext
MemoryContext ecxt_per_tuple_memory
Definition execnodes.h:283
Definition nodes.h:135
static FormData_pg_attribute * TupleDescAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:160
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
Definition wchar.c:2202
int pg_encoding_max_length(int encoding)
Definition wchar.c:2213