// Matches inbound messages that carry a PDF -- attached directly, or attached
// to a nested EML -- whose text AND EXIF metadata both satisfy the checks below.
type.inbound
and (
  // ---- Case 1: the PDF is attached to the message itself ----
  any(filter(attachments, .file_type == "pdf"),
      // (a) text found in the exploded PDF
      (
        // table of contents: OCR text holds the (case-sensitive) marker
        // 'TABLE OF CONTEN' ...
        (
          any(file.explode(.),
              strings.contains(.scan.ocr.raw, 'TABLE OF CONTEN')
          )
          // ... and, checked in its own any() because the table of contents
          // can be on another page, OCR text holds an "Introduction" heading
          // (optionally prefixed "1." / "1:") at the start of the text or
          // of a line, or the phrase 'marked in red'
          and any(file.explode(.),
                  regex.icontains(.scan.ocr.raw,
                                  '(?:[\r\n]|^)+(?:\s*1\s*(?:\.|:))?\s*Introduction'
                  )
                  or strings.icontains(.scan.ocr.raw, 'marked in red')
          )
        )
        // section headings seen in observed documents: some extracted string
        // equals one of these headings (compared with =~)
        or any(file.explode(.),
               any(.scan.strings.strings,
                   any([
                         'Employee Acknowledgement',
                         'Document Summary',
                         'appraisal overview',
                         'accessing full appraisal',
                       ],
                       .. =~ .
                   )
               )
        )
      )
      // (b) EXIF metadata of the PDF
      and (
        // creator mentions HeadlessChrome or Chromium, producer mentions Skia/PDF
        (
          (
            strings.icontains(beta.parse_exif(.).creator, 'HeadlessChrome')
            or strings.icontains(beta.parse_exif(.).creator, 'Chromium')
          )
          and strings.icontains(beta.parse_exif(.).producer, 'Skia/PDF')
        )
        // or: Creator empty / starts with wkhtmltopdf, Title empty or one of
        // the known titles, and producer starts with 'QT '
        or (
          any(beta.parse_exif(.).fields,
              .key == "Creator"
              and (.value == "" or strings.istarts_with(.value, 'wkhtmltopdf'))
          )
          and any(beta.parse_exif(.).fields,
                  .key == "Title"
                  and (
                    .value == ""
                    // company handbook theme
                    or .value == 'Company HandBook'
                    // appraisal theme
                    or strings.icontains(.value, 'Employee Performance Appraisal')
                  )
          )
          and strings.istarts_with(beta.parse_exif(.).producer, 'QT ')
        )
      )
  )
  // ---- Case 2: the PDF is an attachment of an attached EML ----
  or any(filter(attachments,
                .content_type == "message/rfc822" or .file_extension == "eml"
         ),
         any(filter(file.parse_eml(.).attachments, .file_type == "pdf"),
             // (a) text found in the exploded PDF
             (
               // table of contents: OCR text holds the (case-sensitive)
               // marker 'TABLE OF CONTEN' ...
               (
                 any(file.explode(.),
                     strings.contains(.scan.ocr.raw, 'TABLE OF CONTEN')
                 )
                 // ... and, checked in its own any() because the table of
                 // contents can be on another page, OCR text holds an
                 // "Introduction" heading or the phrase 'marked in red'
                 and any(file.explode(.),
                         regex.icontains(.scan.ocr.raw,
                                         '(?:[\r\n]|^)+(?:\s*1\s*(?:\.|:))?\s*Introduction'
                         )
                         or strings.icontains(.scan.ocr.raw, 'marked in red')
                 )
               )
               // section headings seen in observed documents
               or any(file.explode(.),
                      any(.scan.strings.strings,
                          any([
                                'Employee Acknowledgement',
                                'Document Summary',
                                'appraisal overview',
                                'accessing full appraisal',
                              ],
                              .. =~ .
                          )
                      )
               )
             )
             // (b) EXIF metadata of the PDF
             and (
               // creator mentions HeadlessChrome or Chromium, producer
               // mentions Skia/PDF
               (
                 (
                   strings.icontains(beta.parse_exif(.).creator, 'HeadlessChrome')
                   or strings.icontains(beta.parse_exif(.).creator, 'Chromium')
                 )
                 and strings.icontains(beta.parse_exif(.).producer, 'Skia/PDF')
               )
               // or: Creator empty / starts with wkhtmltopdf, Title empty or
               // one of the known titles, and producer starts with 'QT '
               or (
                 any(beta.parse_exif(.).fields,
                     .key == "Creator"
                     and (
                       .value == ""
                       or strings.istarts_with(.value, 'wkhtmltopdf')
                     )
                 )
                 and any(beta.parse_exif(.).fields,
                         .key == "Title"
                         and (
                           .value == ""
                           // company handbook theme
                           or .value == 'Company HandBook'
                           // appraisal theme
                           or strings.icontains(.value,
                                                'Employee Performance Appraisal'
                           )
                         )
                 )
                 and strings.istarts_with(beta.parse_exif(.).producer, 'QT ')
               )
             )
         )
  )
)
Playground
Test against your own EMLs or sample data.