Media Upload Tools #465


Open · wants to merge 12 commits into base: develop
30 changes: 30 additions & 0 deletions .env
@@ -20,3 +20,33 @@ AUTH0_DOMAIN=https://dev-fmjy7n5n.us.auth0.com
AUTH0_KID=uciP2tJdJ4BKWoz73Fmln

MAPTILES_WORKING_DIR=./maptiles

# Google Cloud keys and config
GCS_ENABLE_SERVICES=false
GCS_PROJECT_ID=someproject-45031206
GCS_CLOUD_BUCKET_ID=openbeta-test

# Only needed if you intend to use PULL subscriptions, which is only likely
# if you are working on the GCS integration itself;
# otherwise you can just leave it unset.
# eg: projects/someproject-450306/subscriptions/pull-sub
GCS_NOTIFICATIONS_SUBSCRIPTION=
# Only required if your server is supposed to receive events posted to it
# by a push subscription already set up in GCS to point at you. This is
# practically unheard of in development environments, since it requires
# HTTPS and DNS setup - or some ngrok wizardry, I suppose.
#
# eg: /rest/gcs-event
GCS_MEDIA_HOOK_URL=
# Not required unless you would like to run the FULL integration tests
# against the hook URL.
# e.g: https://stg-api.openbeta.io/rest/gcs-event
GCS_MEDIA_HOOK_PUBLIC=

# Check out the README to see how a service account can be set up for your purposes.
# e.g: [email protected]
GCS_BUCKET_CLIENT_EMAIL=
# The value starts with -----BEGIN PRIVATE KEY-----
GCS_PRIVATE_KEY=

4 changes: 4 additions & 0 deletions .gitignore
@@ -7,9 +7,13 @@ yarn-error.log*
lerna-debug.log*
.DS_Store

bucket

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

key.json

# Runtime data
pids
*.pid
18 changes: 15 additions & 3 deletions package.json
@@ -13,7 +13,12 @@
"@types/supertest": "^2.0.12",
"@types/underscore": "^1.11.4",
"cross-env": "^7.0.3",
"deepmerge": "^4.3.1",
"file-type": "^20.4.1",
"husky": "^8.0.1",
"image-decode": "^1.2.2",
"image-dimensions": "^2.3.0",
"image-size": "^2.0.2",
"jest": "^29.7.0",
"jest-extended": "^4.0.2",
"mongodb-memory-server": "^10.1.2",
@@ -27,6 +32,7 @@
"dependencies": {
"@apollo/server": "^4.11.2",
"@babel/runtime": "^7.17.2",
"@google-cloud/pubsub": "^4.11.0",
"@google-cloud/storage": "^6.9.5",
"@graphql-tools/schema": "^8.3.1",
"@openbeta/sandbag": "^0.0.51",
@@ -36,6 +42,7 @@
"@turf/circle": "^6.5.0",
"@turf/convex": "^6.5.0",
"@turf/helpers": "^6.5.0",
"@types/jsonwebtoken": "^9.0.9",
"@types/uuid": "^8.3.3",
"apollo-datasource-mongodb": "^0.6.0",
"auth0": "^3.4.0",
@@ -47,6 +54,7 @@
"dotenv": "^16.4.4",
"express": "^4.18.2",
"glob": "^10.2.2",
"google-auth-library": "^9.15.1",
"graphql": "^16.9.0",
"graphql-middleware": "^6.1.31",
"graphql-shield": "^7.5.0",
@@ -58,12 +66,13 @@
"jwks-rsa": "^2.1.4",
"mongoose": "^7.8.3",
"mongoose-lean-virtuals": "^1.0.0",
"nanoid": "^5.1.5",
"node-fetch": "2",
"p-limit": "^4.0.0",
"pino": "^9.5.0",
"pino-logflare": "^0.4.2",
"sanitize-html": "^2.7.2",
"sharp": "^0.32.0",
"sharp": "^0.34.1",
"typesense": "^1.8.2",
"underscore": "^1.13.2",
"uuid": "^8.3.2",
@@ -103,8 +112,11 @@
"ignore": [
"build",
"hacks",
"**/*.test.ts",
"db-migrations"
"db-migrations",
"src/db/export/**/*.test.ts",
"src/__tests__/bulkImport.test.ts",
"src/model/__tests__/BulkDataSource.test.ts",
"src/model/__tests__/MutableAreaDataSource.test.ts"
]
},
"type": "module",
13 changes: 12 additions & 1 deletion src/db/MediaObjectSchema.ts
@@ -34,7 +34,18 @@ const schema = new Schema<MediaObject>({
height: { type: Schema.Types.Number, required: true },
size: { type: Schema.Types.Number, required: true },
format: { type: Schema.Types.String, required: true },
entityTags: [EntitySchema]
entityTags: [EntitySchema],
expiresAt: {
type: Date,
// Defines a TTL index on this path.
expires: 0,
// The TTL is disabled by default: we want to prevent any scenario in
// which a developer pushes out an update that causes media to go missing.
default: undefined,
// We don't need to keep track of this after the pending status has lapsed
required: false
}
}, { _id: true, timestamps: true, toJSON: { versionKey: false }, toObject: { versionKey: false } })

/**
6 changes: 5 additions & 1 deletion src/db/MediaObjectTypes.ts
@@ -14,6 +14,7 @@ export interface MediaObject {
createdAt: Date
size: number
entityTags?: EntityTag[]
expiresAt?: Date
}

export interface EntityTag {
@@ -103,9 +104,12 @@ export interface EntityTagDeleteInput {
/**
* GQL user input type for add media mutation
*/
export type MediaObjectGQLInput = Pick<MediaObject, 'mediaUrl' | 'width' | 'height' | 'format' | 'size'> & {
export type MediaObjectGQLInput = Pick<MediaObject, 'width' | 'height' | 'format' | 'size'> & {
userUuid: string
mediaUrl?: string
entityTag?: Omit<AddEntityTagGQLInput, 'mediaId'>
filename?: string
maskFilename?: boolean
}

/**
135 changes: 135 additions & 0 deletions src/google-cloud/README.md
@@ -0,0 +1,135 @@
# Google Cloud Services (GCS) Integration

We currently store our media in a GCS bucket, but many of the concepts and approaches below generalize to other storage bucket providers.

## Abstract

Rather than trying to ingest image data through the GQL endpoint, we route the user's upload content via a side channel. There are good reasons to do this, especially where resource management on our VMs is concerned.

```mermaid
sequenceDiagram
User->>Openbeta GQL: Here is an image file
Openbeta GQL->>Openbeta GQL: Check user auth
Openbeta GQL->>Google Cloud: Can I please have a signed url for my user?
Google Cloud-->>Openbeta GQL: ofc bb 😘
Openbeta GQL->> Openbeta GQL: Create entry in database for the pending media
Openbeta GQL -->>User: Here is an un-reified media object, upload to <signed url>
User->>Google Cloud: Here is the file
Google Cloud-->>User: Okay
Google Cloud->>Openbeta GQL: A new media object just got added with <filename>
Openbeta GQL->Openbeta GQL: Update the un-reified media object such that it is finalized
```
> From the discussion in [the relevant issue](https://github.com/OpenBeta/openbeta-graphql/issues/443)
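The last step in the diagram, finalizing (or "reifying") the pending media object, amounts to clearing its TTL field. A minimal sketch of that update, assuming the data source from this PR's tests (the `reifyMedia` helper name is hypothetical; field names match `src/db/MediaObjectSchema.ts`):

```typescript
import MutableMediaDataSource from '../model/MutableMediaDataSource'

export async function reifyMedia (mediaUrl: string): Promise<void> {
  const mediaDs = MutableMediaDataSource.getInstance()
  // Unsetting expiresAt takes the document out of the TTL index, so the
  // pending media object is no longer eligible for automatic deletion.
  await mediaDs.mediaObjectModel.updateOne(
    { mediaUrl, expiresAt: { $ne: null } },
    { $unset: { expiresAt: 1 } }
  )
}
```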

The diagram isn't quite exhaustive in terms of the real sequence, but it covers the main design. The actual implementation supports [two patterns](https://cloud.google.com/storage/docs/pubsub-notifications): one for a single node, and one for multiple nodes behind a load balancer.

The [pull pattern](https://cloud.google.com/pubsub/docs/subscriber#subscription_type_comparison) is fine for people hosting a single node in front of their data, or for developers who like to develop against a real-life GCS instance.

The [push pattern](https://cloud.google.com/pubsub/docs/subscriber#subscription_type_comparison) works by exposing a route, `/rest/gcs-event` for example, to which Google then sends notifications. This lets your load-balancing strategy take care of the finer details, so you won't need to worry about them.
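A rough sketch of such an endpoint, assuming Express and the `mediaAdded` handler from this PR's adapter interface (the route path mirrors the `GCS_MEDIA_HOOK_URL` example below; everything else is illustrative):

```typescript
import express from 'express'
import { mediaAdded } from './adapter-interface.js'

const app = express()

app.post('/rest/gcs-event', express.json(), (req, res) => {
  // Pub/Sub push wraps the GCS notification in { message: { attributes, data } }.
  const attributes = req.body?.message?.attributes ?? {}
  if (attributes.eventType === 'OBJECT_FINALIZE') {
    // objectId is the uploaded file's name within the bucket.
    mediaAdded({ objectId: attributes.objectId })
      .then(() => res.status(204).end())
      // A non-2xx response makes Pub/Sub redeliver the message later.
      .catch(() => res.status(500).end())
  } else {
    res.status(204).end()
  }
})
```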

## How to upload images (Client Docs)

```mermaid
sequenceDiagram
participant Google Cloud
User->>Openbeta GQL: Here is an image file
Openbeta GQL -->>User: Here is an un-reified media object, upload to <signed url>
User->>Google Cloud: Here is a file
Google Cloud-->>User: Okay
```

This is quite nice since, realistically, you can write one query for any number of new media objects and get back signed endpoints for all of them. The Google storage bucket is a titan of robustness, so you shouldn't have any trouble uploading to the endpoint.
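A sketch of the client side, not a definitive API: the mutation name, its fields, and the response shape here are assumptions for illustration (only `MediaObjectGQLInput` is real, from `src/db/MediaObjectTypes.ts`).

```typescript
const query = `
  mutation AddPendingMedia($input: MediaObjectGQLInput!) {
    addPendingMedia(input: $input) { mediaUrl uploadUrl }
  }`

async function uploadImage (file: File, input: Record<string, unknown>): Promise<void> {
  // Step 1: ask the GQL API for a pending media object plus a signed URL.
  const gqlRes = await fetch('https://api.openbeta.io/graphql', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', Authorization: 'Bearer <token>' },
    body: JSON.stringify({ query, variables: { input } })
  })
  const { data } = await gqlRes.json()

  // Step 2: PUT the raw bytes straight to the signed URL; GCS does the rest.
  await fetch(data.addPendingMedia.uploadUrl, {
    method: 'PUT',
    headers: { 'Content-Type': file.type },
    body: file
  })
}
```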


## Developing

Integration with cloud services is notorious for the headaches it can cause while in a development environment. We are sadly not immune to this, though we try to provide a sane development environment so that contributors can get up and running as quickly as possible.

### Challenges

In principle this logic is perfectly straightforward to test (see the [integration tests](./__tests__/integration.test.ts), which are quite thorough at checking the sanity of all ends of the pipeline). The challenge is the same one that all integration tests have: it can be a real headache to set up all the moving pieces consistently.


### Suggestion to developers

Unless you have to lay hands directly on integration bugs, write your code so that it can be mocked in a sane way.
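For example, a Jest (ESM) mock might look like the sketch below; the module path and the `signUploadUrl` helper are hypothetical stand-ins for whatever wrapper your code calls into.

```typescript
import { jest } from '@jest/globals'

// Replace the google-cloud wrapper before importing the code under test.
jest.unstable_mockModule('../google-cloud/index.js', () => ({
  signUploadUrl: jest.fn(async (filename: string) =>
    `https://storage.example.test/${filename}?signature=fake`)
}))

const { signUploadUrl } = await import('../google-cloud/index.js')
// ...import and exercise the code under test here; it now receives fake URLs.
```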

### How to perform integration testing

You will need to set up Google Cloud services and update your `.env.local` with the vars you need. I have included some pointers below on how to set up your infrastructure for minimal pain.

## Deployment

This portion of the document will not dabble too much in the details and is not a step-by-step guide; it just specifies the required parameters.

### Setting up a bucket

Go to your console, navigate to your project (or create one), and then [create a bucket](https://cloud.google.com/storage/docs/creating-buckets) inside that project context.

### Setting up notifications for bucket

Sadly, [notifications don't exactly come out of the box](https://cloud.google.com/storage/docs/reporting-changes#prereqs). After creating your bucket, you can [create a topic](https://cloud.google.com/pubsub/docs/publish-receive-messages-console#create_a_topic), which will act as a pointer that can receive events from the bucket.
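Once the topic exists, the bucket still needs to be told to publish into it. A minimal sketch using the documented `@google-cloud/storage` API (project, bucket, and topic names are placeholders):

```typescript
import { Storage } from '@google-cloud/storage'

const storage = new Storage({ projectId: 'someproject-45031206' })

// Wire the bucket to the topic; only object creation matters for uploads.
const [notification] = await storage
  .bucket('openbeta-test')
  .createNotification('media-events', {
    eventTypes: ['OBJECT_FINALIZE'],
    payloadFormat: 'JSON_API_V1'
  })

console.log('created notification', notification.id)
```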

### Setting up a service account

- Read/Write/Delete on a Storage Bucket: This requires the Storage Object Admin role (roles/storage.objectAdmin) on the specific bucket. This role grants comprehensive control over objects within the bucket.
- Ability to Sign URLs: This requires the storage.buckets.get and storage.objects.create permissions. These are typically included in roles like Storage Object Admin or Storage Admin (roles/storage.admin).
- Permission to consume a Pull Subscription: This requires the Pub/Sub Subscriber role (roles/pubsub.subscriber) on the specific subscription. (OPTIONAL)

To add a service account to your Google Cloud Storage (GCS) project with the specified permissions, you'll need to perform the following steps using the Google Cloud Console or the Google Cloud CLI.

Using the console, you can try the following:

1. Create a Service Account (if you don't have one already):
- Go to the Service accounts page in the Google Cloud Console.
- Select your project.
- Click + CREATE SERVICE ACCOUNT.
- Enter a Service account name, Service account ID (will be auto-generated), and an optional Service account description.
- Click CREATE AND CONTINUE.
2. Grant the service account the roles listed above.
3. Download the Service Account Key:
- Go back to the Service accounts page.
- Find the service account you created.
- Click the three dots (Actions) in the Actions column.
- Select Manage keys.
- Click ADD KEY and then Create new key.
- Choose JSON as the Key type (recommended).
- Click CREATE. The JSON key file will be downloaded to your computer. Keep this file secure. You can drop that file into the repo root, or you can extract the private key from it and set an OS env var with it; the API will take either.
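With the key in hand, signing upload URLs uses the documented `@google-cloud/storage` API. A minimal sketch, assuming the env vars described below (the `signUploadUrl` helper name is hypothetical):

```typescript
import { Storage } from '@google-cloud/storage'

const storage = new Storage({
  projectId: process.env.GCS_PROJECT_ID,
  credentials: {
    client_email: process.env.GCS_BUCKET_CLIENT_EMAIL,
    private_key: process.env.GCS_PRIVATE_KEY
  }
})

export async function signUploadUrl (filename: string): Promise<string> {
  const [url] = await storage
    .bucket(process.env.GCS_CLOUD_BUCKET_ID ?? '')
    .file(filename)
    .getSignedUrl({
      version: 'v4',
      action: 'write',
      // Keep signed URLs short-lived; 15 minutes is plenty for an upload.
      expires: Date.now() + 15 * 60 * 1000
    })
  return url
}
```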

#### If you want to use **Pull Subscriptions**

You will also need to grant the `roles/pubsub.subscriber` role to your service account for the subscription you wish to use (or project-wide).
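A sketch of the pull side, using the documented `@google-cloud/pubsub` API; the subscription name comes from `GCS_NOTIFICATIONS_SUBSCRIPTION`, and `mediaAdded` is this PR's adapter handler.

```typescript
import { PubSub } from '@google-cloud/pubsub'
import { mediaAdded } from './adapter-interface.js'

const subscription = new PubSub({ projectId: process.env.GCS_PROJECT_ID })
  .subscription(process.env.GCS_NOTIFICATIONS_SUBSCRIPTION ?? '')

subscription.on('message', (message) => {
  const { eventType, objectId } = message.attributes
  if (eventType === 'OBJECT_FINALIZE') {
    mediaAdded({ objectId })
      .then(() => message.ack())
      // nack() asks Pub/Sub to redeliver the message later.
      .catch(() => message.nack())
  } else {
    message.ack()
  }
})
```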

### Env vars

```bash
# Google Cloud keys and config
GCS_ENABLE_SERVICES=true
GCS_PROJECT_ID=someproject-45031206
GCS_CLOUD_BUCKET_ID=openbeta-test

# Only needed if you intend to use PULL subscriptions, which is only likely
# if you are working on the GCS integration itself;
# otherwise you can just leave it unset.
# eg: projects/someproject-450306/subscriptions/pull-sub
GCS_NOTIFICATIONS_SUBSCRIPTION=""

# Only required if your server is supposed to receive events posted to it
# by a push subscription already set up in GCS to point at you. This is
# practically unheard of in development environments, since it requires
# HTTPS and DNS setup - or some ngrok wizardry, I suppose.
#
# eg: /rest/gcs-event
GCS_MEDIA_HOOK_URL=
# Not required unless you would like to run the FULL integration tests
# against the hook URL.
# e.g: https://stg-api.openbeta.io/rest/gcs-event
GCS_MEDIA_HOOK_PUBLIC=

# Check out the section above to see how a service account can be set up for your purposes.
GCS_BUCKET_CLIENT_EMAIL="[email protected]"
GCS_PRIVATE_KEY="BEGIN PRIVATE KEY"
```

You can now set up your env vars in your `.env.local` file, based on the features you would like to enable. Choose one of `GCS_NOTIFICATIONS_SUBSCRIPTION` or `GCS_MEDIA_HOOK_URL`, depending on which strategy you prefer.
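As a final illustration, startup wiring might choose a strategy from those vars roughly as in this sketch (both init helpers are hypothetical stand-ins for this PR's actual wiring; see the pull and push sketches above):

```typescript
declare function startPullSubscriber (subscription: string): Promise<void>
declare function mountPushHook (path: string): void

if (process.env.GCS_ENABLE_SERVICES === 'true') {
  const pullSub = process.env.GCS_NOTIFICATIONS_SUBSCRIPTION
  const pushHook = process.env.GCS_MEDIA_HOOK_URL
  if (pullSub != null && pullSub !== '') {
    // Single node or local development: poll the subscription directly.
    await startPullSubscriber(pullSub)
  } else if (pushHook != null && pushHook !== '') {
    // Behind a load balancer: expose the hook and let GCS push to us.
    mountPushHook(pushHook)
  }
}
```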
98 changes: 98 additions & 0 deletions src/google-cloud/__tests__/adapter.test.ts
@@ -0,0 +1,98 @@
import MutableMediaDataSource from '../../model/MutableMediaDataSource'
import inMemoryDB from '../../utils/inMemoryDB'
import { mediaAdded, standardMessageHandlingLifecycle } from '../adapter-interface'
import { MediaObject } from '../../db/MediaObjectTypes'
import { jest } from '@jest/globals'

describe('Media storage notification adapter tests', () => {
let mediaDs: MutableMediaDataSource
// Our handler will take a trip past the database, so we need to provision
// the database for this suite of tests
beforeAll(async () => { await inMemoryDB.connect(); mediaDs = MutableMediaDataSource.getInstance() })
afterAll(async () => await inMemoryDB.close())

async function unreifiedMedia (): Promise<MediaObject> {
const [ref] = await mediaDs.mediaObjectModel.insertMany([{
mediaUrl: `${process.uptime()}.test`,
width: 100,
height: 100,
format: 'test',
size: 1200,
// 10 second expiry
expiresAt: new Date(Date.now() + 10_000)
}])

return await mediaDs.mediaObjectModel.findById(ref._id).orFail(new Error('woops'))
}

async function reifiedMedia (): Promise<MediaObject> {
const [ref] = await mediaDs.mediaObjectModel.insertMany([{
mediaUrl: `${process.uptime()}.test`,
width: 100,
height: 100,
format: 'test',
size: 1200
}])

return await mediaDs.mediaObjectModel.findById(ref._id).orFail(new Error('woops'))
}

describe('standardMessageHandlingLifecycle', () => {
let mockWork: jest.Mock<(media: MediaObject, mutableDs: MutableMediaDataSource) => Promise<void>>

beforeEach(() => {
mockWork = jest.fn()
})

it('should return early if media object is not found in the database, throwing no error and not performing work', async () => {
await standardMessageHandlingLifecycle({ objectId: 'no such thing' }, mockWork)
expect(await mediaDs.mediaObjectModel.findOne({ mediaUrl: 'no such thing' })).toBeNull()
expect(mockWork).not.toHaveBeenCalled()
})

it('should execute the work function for a valid, unreified media object found in the database', async () => {
const media = await unreifiedMedia()
await standardMessageHandlingLifecycle({ objectId: media.mediaUrl }, mockWork)
expect(mockWork).toHaveBeenCalled()
})

it('should re-throw error from work function', async () => {
const media = await unreifiedMedia()
await expect(standardMessageHandlingLifecycle(
{ objectId: media.mediaUrl },
() => {
throw new Error('error in work')
}
)
).rejects.toThrow(new Error('error in work'))
})
})

describe('mediaAdded', () => {
it('should not attempt to update if the media object is not found', async () => {
const media = await unreifiedMedia()
await mediaDs.mediaObjectModel.deleteOne({ _id: media._id })
await mediaAdded({ objectId: media.mediaUrl })
await mediaAdded({ objectId: 'does not exist' })
})

it('should not fail if media object has expiresAt as null (already reified)', async () => {
const media = await reifiedMedia()
const mockWork: jest.Mock<(media: MediaObject, mutableDs: MutableMediaDataSource) => Promise<void>> = jest.fn()
await standardMessageHandlingLifecycle({ objectId: media.mediaUrl }, mockWork)
expect(mockWork).toHaveBeenCalled()
})

it('should update the media object to unset expiresAt if found and not already reified', async () => {
const media = await unreifiedMedia()
await mediaAdded({ objectId: media.mediaUrl })
expect(await mediaDs.mediaObjectModel.findOne({ mediaUrl: media.mediaUrl }).then(x => x?.expiresAt)).toBeUndefined()
})

it('should not attempt to update if the media object is already reified', async () => {
const media = await reifiedMedia()
await mediaAdded({ objectId: media.mediaUrl })
expect(await mediaDs.mediaObjectModel.findOne({ mediaUrl: media.mediaUrl }).then(x => x?.expiresAt)).toBeUndefined()
})
})
})