Stitch Slack Messages¶
This notebook demonstrates how to use the IndustryDocsSearch
wrapper to query document metadata, pull the matching OCR content from a parquet dataset, filter and sort the messages, and stitch the conversation into a formatted text file.

import os
import requests
import polars as pl
from industryDocumentsWrapper import IndustryDocsSearch
Query Metadata Using IndustryDocsSearch¶
We're going to query the database using the Python wrapper for the UCSF Industry Documents API. The query results provide the metadata, including the document IDs we will use to look up each document's OCR content later on.
wrapper = IndustryDocsSearch()
wrapper.query(q="type:'unknown' AND box:'GLPVD9GQP'", n=-1)
wrapper.save('messages_results.json', format='json')
10/10 documents collected
len(wrapper.results)
10
df = pl.read_json('messages_results.json')
print(set(df["box"]))
{'GLPVD9GQP'}
df.head()
id | collection | collectioncode | box | availability | source | filepath | case | author | documentdate | type | pages | recipient | brand | bates | dateaddeducsf |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
str | list[str] | list[str] | str | list[str] | str | list[str] | list[str] | list[str] | str | list[str] | i64 | list[str] | list[str] | str | str |
"kqkk0323" | ["JUUL Labs Collection"] | ["juul"] | "GLPVD9GQP" | ["public", "no restrictions"] | "[{"type":"plaintext","title":"… | ["Multiparty Messages\GLPVD9GQP"] | ["State of North Carolina, ex rel. Joshua H. Stein, Attorney General, v. JUUL Labs, Inc"] | ["Deborah Tham"] | "2019 July 22" | ["unknown"] | 1 | ["Amy Ding", "Deborah Tham", … "Sarah Ellinger"] | ["Juul"] | "JLI50301949" | "2024 September 26" |
"lqkk0323" | ["JUUL Labs Collection"] | ["juul"] | "GLPVD9GQP" | ["public", "no restrictions"] | "[{"type":"plaintext","title":"… | ["Multiparty Messages\GLPVD9GQP"] | ["State of North Carolina, ex rel. Joshua H. Stein, Attorney General, v. JUUL Labs, Inc"] | ["Deborah Tham"] | "2019 July 22" | ["unknown"] | 1 | ["Amy Ding", "Deborah Tham", … "Sarah Ellinger"] | ["Juul"] | "JLI50301950" | "2024 September 26" |
"mqkk0323" | ["JUUL Labs Collection"] | ["juul"] | "GLPVD9GQP" | ["public", "no restrictions"] | "[{"type":"plaintext","title":"… | ["Multiparty Messages\GLPVD9GQP"] | ["State of North Carolina, ex rel. Joshua H. Stein, Attorney General, v. JUUL Labs, Inc"] | ["Deborah Tham"] | "2019 July 22" | ["unknown"] | 1 | ["Amy Ding", "Deborah Tham", … "Sarah Ellinger"] | ["Juul"] | "JLI50301951" | "2024 September 26" |
"pqkk0323" | ["JUUL Labs Collection"] | ["juul"] | "GLPVD9GQP" | ["public", "no restrictions"] | "[{"type":"plaintext","title":"… | ["Multiparty Messages\GLPVD9GQP"] | ["State of North Carolina, ex rel. Joshua H. Stein, Attorney General, v. JUUL Labs, Inc"] | ["Amy Ding"] | "2019 July 22" | ["unknown"] | 1 | ["Amy Ding", "Deborah Tham", … "Sarah Ellinger"] | ["Juul"] | "JLI50301954" | "2024 September 26" |
"qqkk0323" | ["JUUL Labs Collection"] | ["juul"] | "GLPVD9GQP" | ["public", "no restrictions"] | "[{"type":"plaintext","title":"… | ["Multiparty Messages\GLPVD9GQP"] | ["State of North Carolina, ex rel. Joshua H. Stein, Attorney General, v. JUUL Labs, Inc"] | ["Deborah Tham"] | "2019 July 22" | ["unknown"] | 1 | ["Amy Ding", "Deborah Tham", … "Sarah Ellinger"] | ["Juul"] | "JLI50301955" | "2024 September 26" |
Get OCR Content from the 'unknown' Item Type Dataset¶
Now, we will extract the OCR content for the queried documents using the IDs retrieved in the previous step.
messages_df = pl.read_json('messages_results.json')
docs_df = pl.read_parquet('../../data/juul_unc_unknown.parquet')
filtered_messages = docs_df.filter(pl.col("id").is_in(messages_df["id"]))
Let's double-check that all the messages belong to the same 'GLPVD9GQP' box.
filtered_messages['box'].unique()
box |
---|
str |
"GLPVD9GQP" |
Sort and Filter the Extracted Data¶
We will now sort the documents by their Bates value, which gives us the correct message order.
# order messages by bates
sorted_messages = filtered_messages.sort('bates')
sorted_messages.head(3)
id | tid | bates | type | description | title | author | mentioned | attending | copied | recipient | redacted | collection_name | pages | exhibit_number | document_date | date_added_ucsf | date_modified_ucsf | date_added_industry | date_modified_industry | date_produced | date_shipped | deposition_date | date_privilege_logged | case | industry | drug | adverse_ruling | area | bates_alternate | box | brand | country | language | court | format | express_waiver | file | genre | keywords | bates_master | other_number | request_number | minnesota_request_number | privilege_code | topic | witness | cited | availability | grant_number | source | folder | series | chemical | food | rights | attachment | attachmentnum | conversation | conversationid | custodian | datereceived | datesent | filename | filepath | messageid | subject | timereceived | timesent | redaction | ocr_text |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str | str |
"kqkk0323" | " " | "JLI50301949" | "unknown" | " " | " " | "Deborah Tham" | " " | " " | " " | "Amy Ding, Deborah Tham, Eadon … | " " | "JUUL Labs Collection" | "1" | " " | "Sun Jul 21 17:00:00 PDT 2019" | "Wed Sep 25 17:00:00 PDT 2024" | "Wed Sep 25 17:00:00 PDT 2024" | " " | " " | " " | " " | " " | " " | "State of North Carolina, ex re… | "Tobacco" | " " | " " | " " | " " | "GLPVD9GQP" | "Juul" | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | "public, no restrictions" | " " | "{"type":"plaintext","title":"U… | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | "Multiparty Messages\GLPVD9GQP" | " " | " " | " " | " " | " " | "yep - thanks - for the third q… |
"lqkk0323" | " " | "JLI50301950" | "unknown" | " " | " " | "Deborah Tham" | " " | " " | " " | "Amy Ding, Deborah Tham, Eadon … | " " | "JUUL Labs Collection" | "1" | " " | "Sun Jul 21 17:00:00 PDT 2019" | "Wed Sep 25 17:00:00 PDT 2024" | "Wed Sep 25 17:00:00 PDT 2024" | " " | " " | " " | " " | " " | " " | "State of North Carolina, ex re… | "Tobacco" | " " | " " | " " | " " | "GLPVD9GQP" | "Juul" | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | "public, no restrictions" | " " | "{"type":"plaintext","title":"U… | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | "Multiparty Messages\GLPVD9GQP" | " " | " " | " " | " " | " " | "If we are deemed "automated pr… |
"mqkk0323" | " " | "JLI50301951" | "unknown" | " " | " " | "Deborah Tham" | " " | " " | " " | "Amy Ding, Deborah Tham, Eadon … | " " | "JUUL Labs Collection" | "1" | " " | "Sun Jul 21 17:00:00 PDT 2019" | "Wed Sep 25 17:00:00 PDT 2024" | "Wed Sep 25 17:00:00 PDT 2024" | " " | " " | " " | " " | " " | " " | "State of North Carolina, ex re… | "Tobacco" | " " | " " | " " | " " | "GLPVD9GQP" | "Juul" | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | "public, no restrictions" | " " | "{"type":"plaintext","title":"U… | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | " " | "Multiparty Messages\GLPVD9GQP" | " " | " " | " " | " " | " " | "my question is age verificatio… |
Stitch Conversations¶
Now we will format the extracted messages into a conversational format. We will also clean the content to remove confidential information and unnecessary metadata.
# Stitch the conversation within box GLPVD9GQP
import re
def stitch_conversation(df: pl.DataFrame):
    conversation = []
    for row in df.iter_rows(named=True):
        date = row.get('document_date')
        author = row.get('author')
        content = row.get('ocr_text')
        # strip the confidentiality stamp and Bates number so only the Slack message remains
        cleaned_content = re.sub(r"(?:HIGHLY\s+)?CONFIDENTIAL\s+NC-JLI-Consent Judgment\s+JLI\d+", "", content).strip()
        # format as Date -- Author -- Content
        conversation.append(f"{date} -- {author} -- {cleaned_content}\n")
    return "\n".join(conversation)
conversation = stitch_conversation(sorted_messages)
print(conversation)
Sun Jul 21 17:00:00 PDT 2019 -- Deborah Tham -- yep - thanks - for the third question around automated processing, I am concerned that Leo is raising this just now

Sun Jul 21 17:00:00 PDT 2019 -- Deborah Tham -- If we are deemed "automated processing" under GDPR, this could require changes to our product- of which we have no time to implement without slipping

Sun Jul 21 17:00:00 PDT 2019 -- Deborah Tham -- my question is age verification fully automated or does it require manual intervention?

Sun Jul 21 17:00:00 PDT 2019 -- Amy Ding -- Sorry Deb, just saw that my response to your last e-mail got stuck in draft mode. Was wondering where it went.

Sun Jul 21 17:00:00 PDT 2019 -- Amy Ding -- Age Verification is fully automated on our side, but our current vendor JUMIO employs manual review on each and every transaction in addition to their algorithmic checks

Sun Jul 21 17:00:00 PDT 2019 -- Deborah Tham -- phew

 -- Deborah Tham -- thanks so much

Sun Jul 21 17:00:00 PDT 2019 -- Amy Ding -- Our Compliance team also selectivelymanually reviews and audits transactions

Sun Jul 21 17:00:00 PDT 2019 -- Eadon Jacobs -- to build on amy’s answer, ID verification is not the only solution for ecomm. We also use Veratad which is a fully automated solution (where we match on data points, not ID upload)

Sun Jul 21 17:00:00 PDT 2019 -- Deborah Tham -- Under GDPR if we employ a fully automated process we may have additional requirements
Save the Conversation to a Text File¶
Finally, we save the stitched conversation into a text file for further analysis or archiving.
# save as text file
def save_conversation_to_file(df: pl.DataFrame, output_file: str):
    conversation = stitch_conversation(df)
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(conversation)
save_conversation_to_file(sorted_messages, f"chat_{sorted_messages['box'][0]}.txt")
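To confirm the output file round-trips cleanly, a small self-contained sketch of the same write-then-read pattern (the demo conversation, file name, and message count are illustrative, not taken from the real data):

```python
import os
import tempfile

# Two made-up messages in the Date -- Author -- Content format, separated by blank lines
demo = "2019 July 22 -- Deborah Tham -- phew\n\n2019 July 22 -- Amy Ding -- thanks so much\n"

path = os.path.join(tempfile.gettempdir(), "chat_demo.txt")
with open(path, "w", encoding="utf-8") as f:
    f.write(demo)

# Read it back and count the non-blank lines, one per message
with open(path, encoding="utf-8") as f:
    messages = [line for line in f.read().splitlines() if line.strip()]
print(len(messages))  # -> 2
```

The same count-of-non-blank-lines check against `chat_GLPVD9GQP.txt` should equal the number of rows in `sorted_messages`.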