
Thursday, 4 September 2014

Variable in list with Postgres JDBC and Greenplum

I previously blogged about how to create a variable JDBC IN list with Oracle. Here is how you would do it with Pivotal Greenplum. It is much easier, as there is no need to write a function: in the Greenplum demo below we combine the ANY operator with string_to_array.

http://theblasfrompas.blogspot.com.au/2008/02/variable-in-list-with-oracle-jdbc-and.html

Code as follows
  
import java.sql.*;

/**
 * Created by papicella on 4/09/2014.
 */
public class VariableInListGreenplum
{

    public VariableInListGreenplum()
    {
    }

    private Connection getConnection() throws SQLException, ClassNotFoundException
    {
        // Greenplum speaks the Postgres wire protocol, so the standard
        // PostgreSQL JDBC driver is used
        Class.forName("org.postgresql.Driver");
        return DriverManager.getConnection(
                "jdbc:postgresql://127.0.0.1:5432/apples", "pas", "pas");
    }

    public void run() throws SQLException
    {
        Connection conn = null;
        PreparedStatement stmt = null;
        ResultSet rset = null;
        // string_to_array splits the bound comma-separated string into a text[],
        // cast to int[] so ANY can compare each element against DEPTNO
        String queryInList =
                        "SELECT DEPTNO, " +
                        "       DNAME, " +
                        "       LOC " +
                        "FROM   scott.DEPT " +
                        "WHERE  DEPTNO = any(string_to_array(?, ', ')::int[]) ";

        try
        {
            conn = getConnection();
            stmt = conn.prepareStatement(queryInList);
            stmt.setString(1, "10, 20, 30");
            rset = stmt.executeQuery();

            while (rset.next())
            {
                System.out.println("Dept [" + rset.getInt(1) + ", " +
                                              rset.getString(2) + "]");
            }
        }
        catch (Exception e)
        {
            System.out.println("Exception occurred");
            e.printStackTrace();
        }
        finally
        {
            // close resources in reverse order of creation
            if (rset != null)
            {
                rset.close();
            }

            if (stmt != null)
            {
                stmt.close();
            }

            if (conn != null)
            {
                conn.close();
            }
        }
    }

    public static void main(String[] args) throws Exception
    {
        VariableInListGreenplum test = new VariableInListGreenplum();
        test.run();
    }
}
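
To sanity-check the query outside of JDBC, the same predicate can be run directly with psql. This is only a sketch: it assumes the scott.DEPT demo table exists in the apples database used above.

psql -h 127.0.0.1 -p 5432 -U pas -d apples \
     -c "SELECT deptno, dname, loc
         FROM   scott.dept
         WHERE  deptno = any(string_to_array('10, 20, 30', ', ')::int[])"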

Wednesday, 9 April 2014

Pivotal Greenplum GPLOAD with multiple CSV files

I recently needed to set up a cron script which loads CSV files from a directory into Greenplum every 2 minutes. Once loaded, the files are moved onto Hadoop for archival purposes. The config below shows how to use the GPLOAD data load utility, which utilises GPFDIST (a sketch of the cron wrapper itself follows step 3).

1. Create a load table. In this example the data is moved into the FACT table once the load completes.
  
drop table rtiadmin.rtitrans_etl4;

CREATE TABLE rtiadmin.rtitrans_etl4 (
    imsi character varying(82),
    subscriber_mccmnc character varying(10),
    msisdn character varying(82),
    imei character varying(50),
    called_digits character varying(50),
    start_datetime integer,
    end_datetime integer,
    first_cell_lac integer,
    first_cell_idsac integer,
    current_cell_lac integer,
    current_cell_idsac integer,
    dr_type integer,
    status character varying(50),
    ingest_time bigint,
    processed_time bigint,
    export_time bigint,
    extra_col text,
    gploaded_time timestamp without time zone
)
WITH (appendonly=true) DISTRIBUTED BY (imsi); 

2. The GPLOAD YAML file is defined as follows

VERSION: 1.0.0.1
DATABASE: mydb
USER: rtiadmin
HOST: 172.1.1.1
PORT: 5432
GPLOAD:
   INPUT:
    - SOURCE:
         LOCAL_HOSTNAME:
            - loadhost
         PORT: 8100
         FILE:
          - /data/rti/stage/run/*.csv
    - COLUMNS:
          - imsi : text
          - subscriber_mccmnc : text
          - msisdn : text
          - imei : text
          - called_digits : text
          - start_datetime : text
          - end_datetime : text
          - first_cell_lac : integer
          - first_cell_idsac : integer
          - current_cell_lac : integer
          - current_cell_idsac : integer
          - dr_type : integer
          - status : text
          - ingest_time : bigint
          - processed_time : bigint
          - export_time : bigint
          - extra_col : text
    - FORMAT: text
    - HEADER: false
    - DELIMITER: ','
    - NULL_AS : ''
    - ERROR_LIMIT: 999999
    - ERROR_TABLE: rtiadmin.rtitrans_etl4_err
   OUTPUT:
    - TABLE: rtiadmin.rtitrans_etl4
    - MODE: INSERT
    - MAPPING:
           imsi : imsi
           subscriber_mccmnc : subscriber_mccmnc
           msisdn : msisdn
           imei : imei
           called_digits : called_digits
           start_datetime : substr(start_datetime, 1, 10)::int
           end_datetime : substr(end_datetime, 1, 10)::int
           first_cell_lac : first_cell_lac
           first_cell_idsac : first_cell_idsac
           current_cell_lac : current_cell_lac
           current_cell_idsac : current_cell_idsac
           dr_type : dr_type
           status : status
           ingest_time : ingest_time
           processed_time : processed_time
           export_time : export_time
           extra_col : extra_col
           gploaded_time : current_timestamp
   PRELOAD:
    - TRUNCATE : true 
    - REUSE_TABLES : true
   SQL:
    - AFTER : "insert into rtitrans select * from rtitrans_etl4" 

3. Call GPLOAD as follows

source $HOME/.bash_profile
gpload -f rtidata.yml
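
Since this runs from cron every 2 minutes and the loaded files are then shipped to Hadoop, a wrapper script along the following lines can drive it. This is only a sketch: the script name, the archive destination and the log path are hypothetical and were not part of the original setup.

#!/bin/bash
# load_rti.sh -- hypothetical cron wrapper; archive paths are illustrative
source $HOME/.bash_profile

# load the staged CSV files into the load table
gpload -f rtidata.yml

# on success, archive the loaded files to Hadoop and clear the stage area
if [ $? -eq 0 ]; then
    hdfs dfs -put /data/rti/stage/run/*.csv /archive/rti/
    rm /data/rti/stage/run/*.csv
fi

The matching crontab entry to run it every 2 minutes would look something like this:

*/2 * * * * /path/to/load_rti.sh >> /var/log/rti_load.log 2>&1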


Note: The PGPASSWORD environment variable supplies the password during the load if one is required, which was the case in this demo.
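
For example, the .bash_profile sourced above might export it as follows (the value here is just a placeholder):

export PGPASSWORD=mysecret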

A few things worth noting here.

REUSE_TABLES: This ensures the external tables created during the load are kept and re-used on the next load, rather than being dropped and recreated every 2 minutes.

TRUNCATE: This clears the load table prior to each load. We use this because, once the load is finished, the data is copied into the main FACT table via the "AFTER" SQL hook.
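
As a quick sanity check after a run, the row counts of the load table and the FACT table can be compared from psql (assuming rtitrans lives in the same rtiadmin schema as the load table):

psql -h 172.1.1.1 -d mydb -U rtiadmin -c "select count(*) from rtiadmin.rtitrans_etl4"
psql -h 172.1.1.1 -d mydb -U rtiadmin -c "select count(*) from rtiadmin.rtitrans"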