Unverified Commit 859f4fd0 authored by Sebastian Schüpbach's avatar Sebastian Schüpbach
Browse files

add documentation

parent f7e9de53
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "aho-corasick"
version = "0.7.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5"
dependencies = [
"memchr",
]
[[package]]
name = "anyhow"
version = "1.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "afddf7f520a80dbf76e6f50a35bca42a2331ef227a28b3b6dc5c2e2338d114b1"
[[package]]
name = "atty"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi",
"libc",
"winapi",
]
[[package]]
name = "autocfg"
version = "1.0.1"
......@@ -24,6 +50,19 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "env_logger"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26ecb66b4bdca6c1409b40fb255eefc2bd4f6d135dab3c3124f80ffa2a9661e"
dependencies = [
"atty",
"humantime",
"log",
"regex",
"termcolor",
]
[[package]]
name = "fnv"
version = "1.0.7"
......@@ -193,6 +232,12 @@ version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47"
[[package]]
name = "humantime"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c1ad908cc71012b7bea4d0c53ba96a8cba9962f048fa68d143376143d863b7a"
[[package]]
name = "hyper"
version = "0.14.2"
......@@ -276,9 +321,14 @@ dependencies = [
name = "media-file-distributor"
version = "0.1.0"
dependencies = [
"anyhow",
"env_logger",
"futures",
"hyper",
"log",
"serde",
"tokio",
"toml",
]
[[package]]
......@@ -448,12 +498,50 @@ version = "0.1.57"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce"
[[package]]
name = "regex"
version = "1.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
"thread_local",
]
[[package]]
name = "regex-syntax"
version = "0.6.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581"
[[package]]
name = "scopeguard"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "serde"
version = "1.0.119"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9bdd36f49e35b61d49efd8aa7fc068fd295961fd2286d0b2ee9a4c7a14e99cc3"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.119"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "552954ce79a059ddd5fd68c271592374bd15cab2274970380c000118aeffe1cd"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "signal-hook-registry"
version = "1.3.0"
......@@ -497,6 +585,24 @@ dependencies = [
"unicode-xid",
]
[[package]]
name = "termcolor"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4"
dependencies = [
"winapi-util",
]
[[package]]
name = "thread_local"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb9bc092d0d51e76b2b19d9d85534ffc9ec2db959a2523cdae0697e2972cd447"
dependencies = [
"lazy_static",
]
[[package]]
name = "tokio"
version = "1.0.1"
......@@ -554,6 +660,15 @@ dependencies = [
"tokio-stream",
]
[[package]]
name = "toml"
version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa"
dependencies = [
"serde",
]
[[package]]
name = "tower-service"
version = "0.3.0"
......@@ -628,6 +743,15 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-util"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
dependencies = [
"winapi",
]
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
......
......@@ -7,6 +7,18 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
anyhow = "1.0"
env_logger = "0.8"
log = "0.4"
futures = "0.3.9"
hyper = { version = "0.14.2", features = ["full"] }
serde = { version = "1.0", features = ["derive"] }
tokio = { version = "1", features = ["full"] }
toml = "0.5"
[profile.release]
# Optimize for size
opt-level = 'z'
lto = true
codegen-units = 1
panic = 'abort'
This diff is collapsed.
# Media File Distributor
A small service which provides access to media files hosted on the Memobase sFTP server without requiring the client to install an additional sFTP client library.
At the moment the following endpoints are supported:
* `/media/<record-id>`: Fetch a media file with the respective id. The id consists of the recordSetId (a three-letter code and a three-digit sequential number) as well as the proper id of the record. E.g. `baz-001-MEI_67473`.
* `/thumbnail/<record-id>`: The same for thumbnails (media files in the `thumbnails` directory)
* `/refresh`: Refresh the file cache. This happens also automatically after a predefined duration (see below).
## Installation
You need Rust and Cargo for compilation (see [here](https://rustup.rs/) for instructions).
```sh
git clone https://gitlab.switch.ch/memoriav/memobase-2020/services/import-process/media-file-distributor.git
cd media-file-distributor
rustup target install x86_64-unknown-linux-musl # We use this target to avoid glibc as prerequisite. Of course this isn't strictly mandatory
cargo build --target x86_64-unknown-linux-musl --release # After compilation you should find a binary in ./target/x86_64-unknown-linux-musl/release/
strip target/x86_64-unknown-linux-musl/release/media-file-distributor # Reduce the file size a bit; this isn't mandatory
```
Afterwards, use the example configuration (`config.example.toml`) as template for your own configuration. See description in the file for details.
## Usage
Start the application by giving the path to your configuration file, e.g.
```shell
./media-file-distributor config.toml
```
You can set a customised log level (default: `WARN`) by setting the level in the `RUST_LOG` environment variable:
```shell
export RUST_LOG=info
```
## File considerations
The server assumes a file tree according to this scheme:
```shell
.
├── <recordSet-1>
│ └── thumbnails
│ ├── <file-1>
│ ├── <file-2>
│ └── <file-3>
├── <recordSet-2>
│ ├── media
│ ├── <file-1>
│ ├── <file-2>
│ └── <...>
│ └── thumbnails
│ ├── <file-1>
│ └── <file-2>
├── <...>
│ └── media
│ ├── <file-1>
│ └── <file-2>
```
Importantly, the `media` and `thumbnails` directories have to be direct subfolders of the recordSet directories and must directly contain the media files.
When generating the IDs for the files, spaces in filenames are replaced with underscores (`_`).
host = "0.0.0.0:3000" # Host name and port of the running application
base_path = "/swissbib_index/mb_sftp" # Root of the sFTP directory
refresh_after_sec = 3600 # Duration in seconds after which the cache is refreshed (i.e. the whole file tree is reread)
\ No newline at end of file
/*
* Media File Distributor
* Copyright (C) 2021 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
mod media_folder_utils;
mod service;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Server, Response, Body, Request, StatusCode};
use std::convert::Infallible;
use std::net::SocketAddr;
use std::ops::Range;
use crate::media_folder_utils::MediaFileCache;
use crate::service::Svc;
use anyhow::{Context, Result};
use hyper::service::make_service_fn;
use hyper::{Error, Server};
use log::info;
use serde::Deserialize;
use std::env;
use std::fs;
use futures::StreamExt;
use std::collections::HashMap;
use std::path::Path;
use crate::media_folder_utils::{visit_dirs, MediaFileCache};
use crate::service::MakeSvc;
use std::fs::File;
use std::io::prelude::*;
use std::net::SocketAddr;
use std::str::FromStr;
use std::sync::{Arc, Mutex};
use tokio::time::Duration;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
let addr = SocketAddr::from(([127, 0, 0, 1], 3000));
env_logger::init();
let args = env::args();
let config_dir = args.skip(1).take(1).collect::<String>();
let config = parse_config_file(&config_dir)
.context(format!("Can't parse config on {} file!", &config_dir))?;
let addr = SocketAddr::from_str(&config.host).context("Host name invalid")?;
info!("Server starts on {}", &config.host);
let server = Server::bind(&addr).serve(MakeSvc{
base_dir: args.skip(1).take(1).collect::<String>(),
outdated_after: Duration::new(7200, 0),
});
info!(
"Reading in media files starting with path {}",
&config.base_path
);
let mut media_cache = MediaFileCache::new(config.base_path.clone());
media_cache.refresh().context("Refreshing cache failed!")?;
info!(
"Done reading in media files. Found {} media files and {} thumbnails",
media_cache.dissemination_copies_size(),
media_cache.thumbnails_size()
);
let media_cache = Arc::new(Mutex::new(media_cache));
let refresh_period = config.refresh_after_sec.unwrap_or(7200);
let outdated_after = Duration::new(refresh_period, 0);
info!("Setting refresh period to {}s", refresh_period);
let server = Server::bind(&addr).serve(make_service_fn(move |_| {
let base_dir_cloned = config.base_path.clone();
let media_cache_cloned = media_cache.clone();
async move {
Ok::<_, Error>(Svc {
media_cache: media_cache_cloned,
base_dir: base_dir_cloned,
outdated_after,
})
}
}));
if let Err(e) = server.await {
eprintln!("server error: {}", e);
}
Ok(())
}
/// Loads the configuration file at `path` and deserializes its TOML content into a `Config`.
///
/// # Errors
///
/// Fails if the file can't be opened or read, or if its content can't be
/// deserialized into a `Config`.
pub fn parse_config_file(path: &str) -> Result<Config> {
    let mut raw = String::new();
    File::open(path)?.read_to_string(&mut raw)?;
    let parsed = toml::from_str::<Config>(&raw);
    parsed.context("Parsing of TOML config file failed")
}
/// Application settings as read from the TOML configuration file.
#[derive(Deserialize)]
pub struct Config {
/// Address the server listens on; parsed as a socket address at startup (e.g. `0.0.0.0:3000`).
pub host: String,
/// Root directory which is scanned for media files.
pub base_path: String,
/// Seconds after which the file cache counts as outdated; `main` falls back to 7200 if unset.
pub refresh_after_sec: Option<u64>,
}
/*
* Media File Distributor
* Copyright (C) 2021 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use anyhow::{Context, Result};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::time::{Instant, Duration};
use std::fs;
use std::io::Result;
use std::path::{Path, PathBuf};
use std::time::{Duration, Instant};
/// Caches the media files found in the indicated directory tree.
///
/// Files are registered under an id built from the name of the directory containing the
/// `media` / `thumbnails` folder and the file stem (see `add_file`).
#[derive(Debug)]
pub struct MediaFileCache {
/// Maps ids to the paths of files located directly in a `media` directory.
dissemination_copies: HashMap<String, String>,
/// Maps ids to the paths of files located directly in a `thumbnails` directory.
thumbnails: HashMap<String, String>,
/// Root directory which is scanned when the cache is refreshed.
base_path: String,
/// Moment of creation or of the most recent merge / refresh; basis for the outdated check.
created_on: Instant,
}
impl MediaFileCache {
pub fn new() -> Self {
/// Create a new instance
pub fn new(base_path: String) -> Self {
MediaFileCache {
dissemination_copies: HashMap::new(),
thumbnails: HashMap::new(),
base_path,
created_on: Instant::now(),
}
}
/// Checks if the cache is outdated according to `outdated_after`
pub fn is_outdated(&self, outdated_after: &Duration) -> bool {
let now = Instant::now();
now.duration_since(self.created_on) >= *outdated_after
}
pub fn merge(&mut self, other: MediaFileCache) -> () {
/// Returns the number of cached dissemination copies, i.e. the number of entries
/// in the id-to-path mapping for files found in `media` directories.
pub fn dissemination_copies_size(&self) -> usize {
self.dissemination_copies.len()
}
/// Returns the number of cached thumbnails, i.e. the number of entries
/// in the id-to-path mapping for files found in `thumbnails` directories.
pub fn thumbnails_size(&self) -> usize {
self.thumbnails.len()
}
/// Merges cache with `other`
pub fn merge(&mut self, other: MediaFileCache) {
self.dissemination_copies.extend(other.dissemination_copies);
self.thumbnails.extend(other.thumbnails);
self.created_on = Instant::now();
}
pub fn add_file(&mut self, path: &PathBuf) -> () {
if let Some(p) = path.to_str() {
let path_elements = p.split("/").collect::<Vec<&str>>();
let path_size = path_elements.len();
if path_elements.get(path_size - 2).unwrap() == &"media" {
path_elements.get(path_size - 3);
// self.dissemination_copies.insert();
} else if path_elements.get(path_size - 2).unwrap() == &"thumbnails" {
path_elements.get(path_size - 3);
}
/// Adds a single file to the cache.
///
/// The file is registered under the id `<collection>-<file stem>`, where `<collection>` is
/// the name of the directory containing the `media` / `thumbnails` folder and spaces in the
/// file stem are replaced with underscores. Files whose direct parent directory is neither
/// `media` nor `thumbnails` are ignored.
///
/// # Errors
///
/// Fails if the file stem or the path can't be represented as UTF-8, or if the path has
/// too few components to determine the parent directory or the collection.
pub fn add_file(&mut self, path: &PathBuf) -> Result<()> {
    let id = path
        .file_stem()
        .context("Can't extract file stem")?
        .to_str()
        .context("Can't convert to str")?
        .replace(' ', "_");
    let path = path.to_str().context("Can't read path!")?;

    // Walk the path components from the end, skipping the file name itself. Using an
    // iterator instead of `len() - n` indexing avoids an integer underflow on short paths.
    let mut ancestors = path.rsplit('/').skip(1);
    let parent_dir = ancestors.next().context("Path is invalid")?;
    let target = match parent_dir {
        "media" => &mut self.dissemination_copies,
        "thumbnails" => &mut self.thumbnails,
        // Not a media or thumbnail file: nothing to cache.
        _ => return Ok(()),
    };
    let collection_id = ancestors.next().context("Path is invalid")?;
    target.insert(format!("{}-{}", collection_id, id), path.to_owned());
    Ok(())
}
/// Gets file path for id
pub fn get_file_path(&self, id: &str, media_type: &str) -> Option<String> {
if media_type == "media" {
self.dissemination_copies.get(id).map(|entry| entry.to_owned())
self.dissemination_copies
.get(id)
.map(|entry| entry.to_owned())
} else if media_type == "thumbnail" {
self.thumbnails.get(id).map(|entry| entry.to_owned())
} else {
None
}
}
}
fn fetch_media_folder(dir: &Path) -> Option<&str> {
if let Some(file_name) = dir.file_name() {
if let Some(str) = file_name.to_str() {
if dir.is_dir() && str.contains("media") {
Some("media")
} else if dir.is_dir() && str.contains("thumbnails") {
Some("thumbnails")
} else {
None
}
} else {
None
}
} else {
None
/// Reloads the cache by rescanning the directory tree below the cache's base path.
///
/// Both mappings are replaced as a whole and the creation timestamp is reset. If the scan
/// fails, the current content of the cache is left untouched.
///
/// # Errors
///
/// Fails if the directory tree can't be scanned.
pub fn refresh(&mut self) -> Result<()> {
    let temp_cache =
        visit_dirs(Path::new(&self.base_path)).context("Couldn't scan for files")?;
    self.dissemination_copies = temp_cache.dissemination_copies;
    self.thumbnails = temp_cache.thumbnails;
    self.created_on = Instant::now();
    Ok(())
}
}
pub fn visit_dirs(dir: &Path) -> Result<MediaFileCache> {
let mut media_file_cache = MediaFileCache::new();
/// Scans the directory tree below `dir` recursively and collects all files in a new cache.
///
/// Subdirectories are scanned into their own cache which is then merged into the result;
/// every other directory entry is handed to `MediaFileCache::add_file`.
///
/// # Errors
///
/// Fails if `dir` can't be represented as UTF-8, if a directory can't be read, or if a
/// file can't be added to the cache.
fn visit_dirs(dir: &Path) -> Result<MediaFileCache> {
    let base_path = dir
        .to_str()
        .context("Couldn't convert path to str")?
        .to_owned();
    let mut media_file_cache = MediaFileCache::new(base_path);
    for entry in fs::read_dir(dir)? {
        let path = entry?.path();
        if path.is_dir() {
            media_file_cache.merge(visit_dirs(&path)?);
        } else {
            media_file_cache
                .add_file(&path)
                .context("Can't add file to cache!")?;
        }
    }
    Ok(media_file_cache)
}
\ No newline at end of file
}
use hyper::{Body, Request, Response, StatusCode};
/*
* Media File Distributor
* Copyright (C) 2021 Memoriav
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use crate::media_folder_utils::MediaFileCache;
use anyhow::{Context as AContext, Result};
use hyper::http::header;
use hyper::service::Service;
use crate::media_folder_utils::{MediaFileCache, visit_dirs};
use hyper::{Body, Request, Response, StatusCode};
use log::{error, info, warn};
use std::fs::File;
use std::future::Future;
use std::io::{BufReader, Read};
use std::pin::Pin;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
use std::time::Duration;
use std::path::Path;
pub struct Svc {
media_cache: MediaFileCache,
base_dir: String,
outdated_after: Duration,
pub media_cache: Arc<Mutex<MediaFileCache>>,
pub base_dir: String,
pub outdated_after: Duration,
}
impl Svc {
fn fetch_file(&mut self, id: String, file_type: &str) -> Result<Response<Body>, hyper::Error> {
if self.media_cache.is_outdated(&self.outdated_after) {
let base_dir = self.base_dir.clone();
self.media_cache = visit_dirs(Path::new(&base_dir)).unwrap()
}
match self.media_cache.get_file_path(&id, file_type) {
Some(p) => Ok(Response::new(Body::from(p))),
None => {
let mut not_found = Response::default();