#!pip install industryDocumentsWrapper


import re
import os
import polars as pl
import pandas as pd
import zipfile
from industryDocumentsWrapper.ucsf_api import IndustryDocsSearch


wrapper = IndustryDocsSearch()
wrapper.query( 
              industry='tobacco', 
              case='State of North Carolina',
              collection='JUUL labs Collection',
              doc_type='email',
              n=500)
wrapper.save('query_results.json', format='json')

100/500 documents collected
200/500 documents collected
300/500 documents collected
400/500 documents collected
500/500 documents collected


docs_df = pl.read_json('query_results.json')


docs_df.sample(3)


def get_ocr_content(df, zip_dir):
    id_list = df['id'].str.strip_chars().to_list()
    main_df = pl.DataFrame()

    with zipfile.ZipFile(zip_dir, 'r') as zipf:
        for idx, file in enumerate(zipf.namelist()):
            with zipf.open(file) as f:
                try:
                    temp_df = pl.read_csv(f, separator='|')
                    temp_df = temp_df.filter(pl.col('id').cast(pl.String).str.strip_chars().is_in(id_list))
                    main_df = pl.concat([main_df, temp_df])
                except Exception as e:
                    print(f'Error processing file {file}: {e}')
                    continue
            print(f'CSVs Checked: {idx+1}/{len(zipf.namelist())} | Docs Matched: {len(main_df)}/{len(id_list)}', end='\r')

    return main_df


zip_dir = '../JUUL_Labs_Collection.zip'
df_pl = get_ocr_content(docs_df, zip_dir)

Error processing file JUUL_Labs_Collection_56.csv: type String is incompatible with expected type Int64
Error processing file JUUL_Labs_Collection_172.csv: type String is incompatible with expected type Int64
Error processing file JUUL_Labs_Collection_195.csv: type String is incompatible with expected type Int64
Error processing file JUUL_Labs_Collection_197.csv: type String is incompatible with expected type Int64
Error processing file JUUL_Labs_Collection_214.csv: type String is incompatible with expected type Int64
Error processing file JUUL_Labs_Collection_296.csv: type String is incompatible with expected type Int64
Error processing file JUUL_Labs_Collection_515.csv: type String is incompatible with expected type Int64
Error processing file JUUL_Labs_Collection_717.csv: type String is incompatible with expected type Int64
Error processing file JUUL_Labs_Collection_778.csv: type String is incompatible with expected type Int64
Error processing file JUUL_Labs_Collection_1029.csv: type String is incompatible with expected type Int64
Error processing file JUUL_Labs_Collection_1675.csv: type String is incompatible with expected type Int64
Error processing file JUUL_Labs_Collection_1733.csv: type String is incompatible with expected type Int64
Error processing file JUUL_Labs_Collection_1850.csv: type String is incompatible with expected type Int64
CSVs Checked: 2158/2158 | Docs Matched: 490/500


df_pl.sample(3)


df_pl.write_parquet("juul_dataset.parquet")

id	collection	collectioncode	custodian	availability	source	datesent	redactedby	datereceived	filename	filepath	case	title	author	documentdate	type	pages	recipient	brand	bates	redacted	dateaddeducsf	url	topic	copied	attachment
str	list[str]	list[str]	list[str]	list[str]	str	str	list[str]	str	str	list[str]	list[str]	str	list[str]	str	list[str]	i64	list[str]	list[str]	str	str	str	str	str	list[str]	list[str]
"fkmp0290"	["JUUL Labs Collection"]	["juul"]	["Vose, Josh"]	["public", "no restrictions"]	"[{"type":"plaintext","title":"…	"2018 August 21"	["UCSF"]	null	"RE: PROT-00008 DM Agenda Topic…	["\Vose, Josh\Josh_Vose_GDrive_1-1-2019_to_9-30-2019_0\Sent_1Mg5PnITKFIsTo7J_IVX-acdFzQIa3nia.mbox\RE: PROT-00008 DM Agenda Topics & Metrics- 24 AUG 18", "\Vose, Josh\Josh_Vose_GDrive_1-1-2019_to_9-30-2019_0\Archived_1Nqvm9J4d1UKDnmJgqX9zuBeMSI1JtVOS.mbox\RE: PROT-00008 DM Agenda Topics & Metrics- 24 AUG 18"]	["State of North Carolina, ex rel. Joshua H. Stein, Attorney General, v. JUUL Labs, Inc"]	"RE: PROT-00008 DM Agenda Topic…	["Scott Stange <sstange@araonline.net>"]	"2018 August 21"	["email"]	3	["Ann Suhadolnik <asuhadolnik@juul.com>", "Srinivasa Batchu <srinivasa@juul.com>", … "Maria Walters <maria@juul.com>"]	["Juul"]	"JLI09513077"	"yes"	"2024 February 29"	"https://www.industrydocuments.…	null	["Ben Adeyi <badeyi@juul.com>", "Concetta Carbonaro <concetta@juul.com>", … "Lauren Intagliata <lintagliata@araonline.net>"]	null
"fncw0300"	["JUUL Labs Collection"]	["juul"]	["Burbidge, Cole"]	["public", "no restrictions"]	"[{"type":"plaintext","title":"…	"2019 March 05"	null	"2019 March 05"	null	["\Burbidge, Cole\Cole Burbidge_Email\Cole_Burbidge_Email_cburbidge@juul.com_0.pst\Top of Personal Folders\Inbox\Automatic reply: Suggestions for data collection"]	["State of North Carolina, ex rel. Joshua H. Stein, Attorney General, v. JUUL Labs, Inc"]	"Automatic reply: Suggestions f…	["Calogera McCormick on behalf of Calogera McCormick <calogera.mccormick©synchrogenix.com>"]	"2019 March 05"	["email"]	1	["Cole Burbidge"]	["Juul"]	"JLI08275701"	"yes"	"2024 April 25"	"https://www.industrydocuments.…	null	null	null
"fffn0291"	["JUUL Labs Collection"]	["juul"]	["Sillin, Nat"]	["public", "no restrictions"]	"[{"type":"plaintext","title":"…	"2019 April 22"	["UCSF"]	null	"Re: Signature gathering"	["\Sillin, Nat\Nat Sillin_Email_4-1-2019 to 4-1-2020\Nat_Sillin_Email_4-1-2019_to_4-1-2020--nat@juul.com_0 .mbox\1631644364172841357-4ef8cc17-9ac3-42f3-b6e3-4f4de50c3daf.mbox.eml\Re: Signature gathering", "\Sillin, Nat\Nathaniel Sillin_Email_2-1-2019 through 4-30-2019\Nathaniel_Sillin_Email_2-1-2019_through_4-30-2019--nat@juul.com_0 .mbox\1631644364172841357-c1b7a267-9a56-4a58-b5bd-2f761dc9089c.mbox.eml\Re: Signature gathering", "\Sillin, Nat\Nathaniel_Sillin_Email_4-4-2019_to_8-12-2019_nat@juul.com_0 .mbox\1631644364172841357-67c42dc1-20b4-4020-aaa5-883c49ce235c.mbox.eml\Re: Signature gathering"]	["State of North Carolina, ex rel. Joshua H. Stein, Attorney General, v. JUUL Labs, Inc"]	"Re: Signature gathering"	["John Whitehurst <johnw@bmwl.net>"]	"2019 April 22"	["email"]	2	["Nat Sillin <nat@juul.com>"]	["Juul"]	"JLI09048249"	"yes"	"2024 March 28"	"https://www.industrydocuments.…	null	["Mark Mosher <markm@bmwl.net>"]	null

id	tid	bates	type	description	title	author	mentioned	attending	copied	recipient	redacted	collection_name	pages	exhibit_number	document_date	date_added_ucsf	date_modified_ucsf	date_added_industry	date_modified_industry	date_produced	date_shipped	deposition_date	date_privilege_logged	case	industry	drug	adverse_ruling	area	bates_alternate	box	brand	country	language	court	format	express_waiver	file	genre	keywords	bates_master	other_number	request_number	minnesota_request_number	privilege_code	topic	witness	cited	availability	grant_number	source	folder	series	chemical	food	rights	attachment	attachmentnum	conversation	conversationid	custodian	datereceived	datesent	filename	filepath	messageid	subject	timereceived	timesent	redaction	ocr_text
str	str	str	str	str	str	str	str	str	str	str	str	str	i64	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str	str
"fngc0286"	" "	"JLI04844816"	"email"	" "	"Curious for a new epicurean ad…	"Curio Collection by Hilton <cu…	" "	" "	" "	"julie@juul.com"	" "	"JUUL Labs Collection"	2	" "	"Mon May 06 17:00:00 PDT 2019"	"Wed Jan 24 16:00:00 PST 2024"	"Wed Jan 24 16:00:00 PST 2024"	" "	" "	" "	" "	" "	" "	"State of North Carolina, ex re…	"Tobacco"	" "	" "	" "	" "	" "	"Juul"	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	"public, no restrictions"	" "	"{"type":"plaintext","title":"U…	" "	" "	" "	" "	" "	" "	" "	" "	" "	"Henderson, Julie"	" "	"2019 May 07"	"Curious for a new epicurean ad…	"\Henderson, Julie\Julie Hender…	" "	" "	" "	" "	" "	"From: To: Sent: Subject: …
"fhkc0297"	" "	"JLI31611005"	"email"	" "	"Re: Parallel pathing it?"	"Tom Kirchner <tom.kirchner@nyu…	" "	" "	"Adam Bowen <adam@pax.com>"	"Gal Cohen <gal@pax.com>"	" "	"JUUL Labs Collection"	2	" "	"Sun Dec 18 16:00:00 PST 2016"	"Wed Mar 27 17:00:00 PDT 2024"	"Wed Mar 27 17:00:00 PDT 2024"	" "	" "	" "	" "	" "	" "	"State of North Carolina, ex re…	"Tobacco"	" "	" "	" "	" "	" "	"Juul"	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	"Government & Public Affairs"	" "	" "	"public, no restrictions"	" "	"{"type":"plaintext","title":"U…	" "	" "	" "	" "	" "	" "	" "	" "	" "	"JLI"	" "	"2016 December 19"	"Re: Parallel pathing it?"	"\JLI\20200713 KOL Email Addres…	" "	" "	" "	" "	" "	"From: To: CC: Sent: Subjec…
"fhhd0314"	" "	"JLI10555255"	"email"	" "	"Re: JUUL Store Lists"	"Alicia Grossman <agrossman@juu…	" "	" "	"Alex Cantwell <"alex cantwell …	"Tania Goulart <"tania goulart …	"yes"	"JUUL Labs Collection"	6	" "	"Wed Apr 25 17:00:00 PDT 2018"	"Wed Jul 24 17:00:00 PDT 2024"	"Wed Jul 24 17:00:00 PDT 2024"	" "	" "	" "	" "	" "	" "	"State of North Carolina, ex re…	"Tobacco"	" "	" "	" "	" "	" "	"Juul"	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	" "	"public, no restrictions"	" "	"{"type":"plaintext","title":"U…	" "	" "	" "	" "	" "	" "	" "	" "	" "	"Cantwell, Alexander"	"2018 April 26"	"2018 April 26"	"Re: JUUL Store Lists"	"\Cantwell, Alexander\Alexander…	" "	" "	" "	" "	" "	"From: To: CC: Sent: Subjec…