mirror of
https://github.com/rclone/rclone.git
synced 2025-04-19 01:59:00 +08:00

Add a new backend to support mounting datasets published with a digital object identifier (DOI).
600 lines
17 KiB
Go
600 lines
17 KiB
Go
// Package doi provides a filesystem interface for digital objects identified by DOIs.
|
|
//
|
|
// See: https://www.doi.org/the-identifier/what-is-a-doi/
|
|
package doi
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"path"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/rclone/rclone/backend/doi/api"
|
|
"github.com/rclone/rclone/fs"
|
|
"github.com/rclone/rclone/fs/config/configmap"
|
|
"github.com/rclone/rclone/fs/config/configstruct"
|
|
"github.com/rclone/rclone/fs/fserrors"
|
|
"github.com/rclone/rclone/fs/fshttp"
|
|
"github.com/rclone/rclone/fs/hash"
|
|
"github.com/rclone/rclone/lib/cache"
|
|
"github.com/rclone/rclone/lib/pacer"
|
|
"github.com/rclone/rclone/lib/rest"
|
|
)
|
|
|
|
// doiResolverAPIURL is the URL of the DOI resolver REST API.
//
// Reference: https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation
const doiResolverAPIURL = "https://doi.org/api"

// Settings handed to the pacer used for API calls.
const (
	minSleep      = 10 * time.Millisecond // minimum sleep passed to the pacer
	maxSleep      = 2 * time.Second       // maximum sleep passed to the pacer
	decayConstant = 2                     // bigger for slower decay, exponential
)
|
|
|
|
// errorReadOnly is returned by every operation which would modify the remote.
var errorReadOnly = errors.New("doi remotes are read only")

// timeUnset is the Unix epoch.
var timeUnset = time.Unix(0, 0)
|
|
|
|
func init() {
|
|
fsi := &fs.RegInfo{
|
|
Name: "doi",
|
|
Description: "DOI datasets",
|
|
NewFs: NewFs,
|
|
CommandHelp: commandHelp,
|
|
Options: []fs.Option{{
|
|
Name: "doi",
|
|
Help: "The DOI or the doi.org URL.",
|
|
Required: true,
|
|
}, {
|
|
Name: fs.ConfigProvider,
|
|
Help: `DOI provider.
|
|
|
|
The DOI provider can be set when rclone does not automatically recognize a supported DOI provider.`,
|
|
Examples: []fs.OptionExample{
|
|
{
|
|
Value: "auto",
|
|
Help: "Auto-detect provider",
|
|
},
|
|
{
|
|
Value: string(Zenodo),
|
|
Help: "Zenodo",
|
|
}, {
|
|
Value: string(Dataverse),
|
|
Help: "Dataverse",
|
|
}, {
|
|
Value: string(Invenio),
|
|
Help: "Invenio",
|
|
}},
|
|
Required: false,
|
|
Advanced: true,
|
|
}},
|
|
}
|
|
fs.Register(fsi)
|
|
}
|
|
|
|
// Provider identifies the kind of service hosting the DOI.
type Provider string

// The providers known to this backend.
const (
	// Zenodo provider, see https://zenodo.org
	Zenodo = Provider("zenodo")
	// Dataverse provider, see https://dataverse.harvard.edu
	Dataverse = Provider("dataverse")
	// Invenio provider, see https://inveniordm.docs.cern.ch
	Invenio = Provider("invenio")
)
|
|
|
|
// Options holds the configuration of this backend as read from the config.
type Options struct {
	// The DOI, a digital identifier of an object, usually a dataset
	Doi string `config:"doi"`
	// The DOI provider
	Provider string `config:"provider"`
}
|
|
|
|
// Fs stores the interface to the remote HTTP files
|
|
type Fs struct {
|
|
name string // name of this remote
|
|
root string // the path we are working on
|
|
provider Provider // the DOI provider
|
|
features *fs.Features // optional features
|
|
opt Options // options for this backend
|
|
ci *fs.ConfigInfo // global config
|
|
endpoint *url.URL // the main API endpoint for this remote
|
|
endpointURL string // endpoint as a string
|
|
srv *rest.Client // the connection to the server
|
|
pacer *fs.Pacer // pacer for API calls
|
|
cache *cache.Cache // a cache for the remote metadata
|
|
}
|
|
|
|
// Object is a remote object that has been stat'd (so it exists, but is not necessarily open for reading)
|
|
type Object struct {
|
|
fs *Fs // what this object is part of
|
|
remote string // the remote path
|
|
contentURL string // the URL where the contents of the file can be downloaded
|
|
size int64 // size of the object
|
|
modTime time.Time // modification time of the object
|
|
contentType string // content type of the object
|
|
md5 string // MD5 hash of the object content
|
|
}
|
|
|
|
// parseDoi normalizes the user supplied identifier to a bare DOI.
//
// It accepts a bare DOI, a "doi:" URI or a URL on doi.org (or one of its
// sub-domains such as dx.doi.org). The "doi:" scheme and the host name are
// matched case-insensitively. Input which is not recognized, or which
// cannot be parsed as a URL, is returned unchanged.
//
// Examples:
//
//	10.1000/182 -> 10.1000/182
//	https://doi.org/10.1000/182 -> 10.1000/182
//	doi:10.1000/182 -> 10.1000/182
func parseDoi(doi string) string {
	doiURL, err := url.Parse(doi)
	if err != nil {
		return doi
	}
	if doiURL.Scheme == "doi" {
		// url.Parse lower-cases the scheme, so the input may start with
		// "DOI:" or "Doi:" - strip the prefix by length rather than by value.
		return strings.TrimLeft(doi[len("doi:"):], "/")
	}
	// Match doi.org itself or a sub-domain of it, but not an unrelated host
	// whose name merely ends in "doi.org".
	host := strings.ToLower(doiURL.Hostname())
	if host == "doi.org" || strings.HasSuffix(host, ".doi.org") {
		return strings.TrimLeft(doiURL.Path, "/")
	}
	return doi
}
|
|
|
|
// Resolve a DOI to a URL
|
|
// Reference: https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation
|
|
func resolveDoiURL(ctx context.Context, srv *rest.Client, pacer *fs.Pacer, opt *Options) (doiURL *url.URL, err error) {
|
|
var result api.DoiResolverResponse
|
|
params := url.Values{}
|
|
params.Add("index", "1")
|
|
opts := rest.Opts{
|
|
Method: "GET",
|
|
RootURL: doiResolverAPIURL,
|
|
Path: "/handles/" + opt.Doi,
|
|
Parameters: params,
|
|
}
|
|
err = pacer.Call(func() (bool, error) {
|
|
res, err := srv.CallJSON(ctx, &opts, nil, &result)
|
|
return shouldRetry(ctx, res, err)
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if result.ResponseCode != 1 {
|
|
return nil, fmt.Errorf("could not resolve DOI (error code %d)", result.ResponseCode)
|
|
}
|
|
resolvedURLStr := ""
|
|
for _, value := range result.Values {
|
|
if value.Type == "URL" && value.Data.Format == "string" {
|
|
valueStr, ok := value.Data.Value.(string)
|
|
if !ok {
|
|
return nil, fmt.Errorf("could not resolve DOI (incorrect response format)")
|
|
}
|
|
resolvedURLStr = valueStr
|
|
}
|
|
}
|
|
resolvedURL, err := url.Parse(resolvedURLStr)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return resolvedURL, nil
|
|
}
|
|
|
|
// Resolve the passed configuration into a provider and enpoint
|
|
func resolveEndpoint(ctx context.Context, srv *rest.Client, pacer *fs.Pacer, opt *Options) (provider Provider, endpoint *url.URL, err error) {
|
|
resolvedURL, err := resolveDoiURL(ctx, srv, pacer, opt)
|
|
if err != nil {
|
|
return "", nil, err
|
|
}
|
|
|
|
switch opt.Provider {
|
|
case string(Dataverse):
|
|
return resolveDataverseEndpoint(resolvedURL)
|
|
case string(Invenio):
|
|
return resolveInvenioEndpoint(ctx, srv, pacer, resolvedURL)
|
|
case string(Zenodo):
|
|
return resolveZenodoEndpoint(ctx, srv, pacer, resolvedURL, opt.Doi)
|
|
}
|
|
|
|
hostname := strings.ToLower(resolvedURL.Hostname())
|
|
if hostname == "dataverse.harvard.edu" || activateDataverse(resolvedURL) {
|
|
return resolveDataverseEndpoint(resolvedURL)
|
|
}
|
|
if hostname == "zenodo.org" || strings.HasSuffix(hostname, ".zenodo.org") {
|
|
return resolveZenodoEndpoint(ctx, srv, pacer, resolvedURL, opt.Doi)
|
|
}
|
|
if activateInvenio(ctx, srv, pacer, resolvedURL) {
|
|
return resolveInvenioEndpoint(ctx, srv, pacer, resolvedURL)
|
|
}
|
|
|
|
return "", nil, fmt.Errorf("provider '%s' is not supported", resolvedURL.Hostname())
|
|
}
|
|
|
|
// Make the http connection from the passed options
|
|
func (f *Fs) httpConnection(ctx context.Context, opt *Options) (isFile bool, err error) {
|
|
provider, endpoint, err := resolveEndpoint(ctx, f.srv, f.pacer, opt)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
|
|
// Update f with the new parameters
|
|
f.srv.SetRoot(endpoint.ResolveReference(&url.URL{Path: "/"}).String())
|
|
f.endpoint = endpoint
|
|
f.endpointURL = endpoint.String()
|
|
f.provider = provider
|
|
f.opt.Provider = string(provider)
|
|
|
|
// Determine if the root is a file
|
|
switch f.provider {
|
|
case Dataverse:
|
|
entries, err := f.listDataverseDoiFiles(ctx)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
for _, entry := range entries {
|
|
if entry.remote == f.root {
|
|
isFile = true
|
|
break
|
|
}
|
|
}
|
|
case Invenio, Zenodo:
|
|
isFile = f.root != ""
|
|
}
|
|
|
|
return isFile, nil
|
|
}
|
|
|
|
// retryErrorCodes lists the HTTP status codes which cause a request to be
// retried.
var retryErrorCodes = []int{
	http.StatusTooManyRequests,     // 429
	http.StatusInternalServerError, // 500
	http.StatusBadGateway,          // 502
	http.StatusServiceUnavailable,  // 503
	http.StatusGatewayTimeout,      // 504
	509,                            // Bandwidth Limit Exceeded (no constant in net/http)
}
|
|
|
|
// shouldRetry returns a boolean as to whether this resp and err
|
|
// deserve to be retried. It returns the err as a convenience
|
|
func shouldRetry(ctx context.Context, res *http.Response, err error) (bool, error) {
|
|
if fserrors.ContextError(ctx, &err) {
|
|
return false, err
|
|
}
|
|
return fserrors.ShouldRetry(err) || fserrors.ShouldRetryHTTP(res, retryErrorCodes), err
|
|
}
|
|
|
|
// NewFs creates a new Fs object from the name and root. It connects to
|
|
// the host specified in the config file.
|
|
func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, error) {
|
|
root = strings.Trim(root, "/")
|
|
|
|
// Parse config into Options struct
|
|
opt := new(Options)
|
|
err := configstruct.Set(m, opt)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
opt.Doi = parseDoi(opt.Doi)
|
|
|
|
client := fshttp.NewClient(ctx)
|
|
ci := fs.GetConfig(ctx)
|
|
f := &Fs{
|
|
name: name,
|
|
root: root,
|
|
opt: *opt,
|
|
ci: ci,
|
|
srv: rest.NewClient(client),
|
|
pacer: fs.NewPacer(ctx, pacer.NewDefault(pacer.MinSleep(minSleep), pacer.MaxSleep(maxSleep), pacer.DecayConstant(decayConstant))),
|
|
cache: cache.New(),
|
|
}
|
|
f.features = (&fs.Features{
|
|
CanHaveEmptyDirectories: true,
|
|
}).Fill(ctx, f)
|
|
|
|
isFile, err := f.httpConnection(ctx, opt)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if isFile {
|
|
// return an error with an fs which points to the parent
|
|
newRoot := path.Dir(f.root)
|
|
if newRoot == "." {
|
|
newRoot = ""
|
|
}
|
|
f.root = newRoot
|
|
return f, fs.ErrorIsFile
|
|
}
|
|
|
|
return f, nil
|
|
}
|
|
|
|
// Name returns the configured name of the file system
|
|
func (f *Fs) Name() string {
|
|
return f.name
|
|
}
|
|
|
|
// Root returns the root for the filesystem
|
|
func (f *Fs) Root() string {
|
|
return f.root
|
|
}
|
|
|
|
// String returns the URL for the filesystem
|
|
func (f *Fs) String() string {
|
|
return fmt.Sprintf("DOI %s", f.opt.Doi)
|
|
}
|
|
|
|
// Features returns the optional features of this Fs
|
|
func (f *Fs) Features() *fs.Features {
|
|
return f.features
|
|
}
|
|
|
|
// Precision is the remote http file system's modtime precision, which we have no way of knowing. We estimate at 1s
|
|
func (f *Fs) Precision() time.Duration {
|
|
return time.Second
|
|
}
|
|
|
|
// Hashes returns hash.HashNone to indicate remote hashing is unavailable
|
|
func (f *Fs) Hashes() hash.Set {
|
|
return hash.Set(hash.MD5)
|
|
// return hash.Set(hash.None)
|
|
}
|
|
|
|
// Mkdir makes the root directory of the Fs object
|
|
func (f *Fs) Mkdir(ctx context.Context, dir string) error {
|
|
return errorReadOnly
|
|
}
|
|
|
|
// Remove a remote http file object
|
|
func (o *Object) Remove(ctx context.Context) error {
|
|
return errorReadOnly
|
|
}
|
|
|
|
// Rmdir removes the root directory of the Fs object
|
|
func (f *Fs) Rmdir(ctx context.Context, dir string) error {
|
|
return errorReadOnly
|
|
}
|
|
|
|
// NewObject creates a new remote http file object
|
|
func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) {
|
|
var entries []*Object
|
|
var err error
|
|
switch f.provider {
|
|
case Dataverse:
|
|
entries, err = f.listDataverseDoiFiles(ctx)
|
|
case Invenio, Zenodo:
|
|
entries, err = f.listInvevioDoiFiles(ctx)
|
|
default:
|
|
err = fmt.Errorf("provider type '%s' not supported", f.provider)
|
|
}
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
for _, entry := range entries {
|
|
if entry.Remote() == remote {
|
|
return entry, nil
|
|
}
|
|
}
|
|
|
|
return nil, fs.ErrorObjectNotFound
|
|
}
|
|
|
|
// List the objects and directories in dir into entries. The
|
|
// entries can be returned in any order but should be for a
|
|
// complete directory.
|
|
//
|
|
// dir should be "" to list the root, and should not have
|
|
// trailing slashes.
|
|
//
|
|
// This should return ErrDirNotFound if the directory isn't
|
|
// found.
|
|
func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
|
|
switch f.provider {
|
|
case Dataverse:
|
|
return f.listDataverse(ctx, dir)
|
|
case Invenio, Zenodo:
|
|
return f.listInvenio(ctx, dir)
|
|
default:
|
|
return nil, fmt.Errorf("provider type '%s' not supported", f.provider)
|
|
}
|
|
}
|
|
|
|
// Put in to the remote path with the modTime given of the given size
|
|
//
|
|
// May create the object even if it returns an error - if so
|
|
// will return the object and the error, otherwise will return
|
|
// nil and the error
|
|
func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
|
|
return nil, errorReadOnly
|
|
}
|
|
|
|
// PutStream uploads to the remote path with the modTime given of indeterminate size
|
|
func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
|
|
return nil, errorReadOnly
|
|
}
|
|
|
|
// Fs is the filesystem this remote http file object is located within
|
|
func (o *Object) Fs() fs.Info {
|
|
return o.fs
|
|
}
|
|
|
|
// String returns the URL to the remote HTTP file
|
|
func (o *Object) String() string {
|
|
if o == nil {
|
|
return "<nil>"
|
|
}
|
|
return o.remote
|
|
}
|
|
|
|
// Remote the name of the remote HTTP file, relative to the fs root
|
|
func (o *Object) Remote() string {
|
|
return o.remote
|
|
}
|
|
|
|
// Hash returns "" since HTTP (in Go or OpenSSH) doesn't support remote calculation of hashes
|
|
func (o *Object) Hash(ctx context.Context, t hash.Type) (string, error) {
|
|
if t != hash.MD5 {
|
|
return "", hash.ErrUnsupported
|
|
}
|
|
return o.md5, nil
|
|
}
|
|
|
|
// Size returns the size in bytes of the remote http file
|
|
func (o *Object) Size() int64 {
|
|
return o.size
|
|
}
|
|
|
|
// ModTime returns the modification time of the remote http file
|
|
func (o *Object) ModTime(ctx context.Context) time.Time {
|
|
return o.modTime
|
|
}
|
|
|
|
// SetModTime sets the modification and access time to the specified time
|
|
//
|
|
// it also updates the info field
|
|
func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error {
|
|
return errorReadOnly
|
|
}
|
|
|
|
// Storable returns whether the remote http file is a regular file (not a directory, symbolic link, block device, character device, named pipe, etc.)
|
|
func (o *Object) Storable() bool {
|
|
return true
|
|
}
|
|
|
|
// Open a remote http file object for reading. Seek is supported
|
|
func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) {
|
|
fs.FixRangeOption(options, o.size)
|
|
opts := rest.Opts{
|
|
Method: "GET",
|
|
RootURL: o.contentURL,
|
|
Options: options,
|
|
}
|
|
var res *http.Response
|
|
err = o.fs.pacer.Call(func() (bool, error) {
|
|
res, err = o.fs.srv.Call(ctx, &opts)
|
|
return shouldRetry(ctx, res, err)
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Open failed: %w", err)
|
|
}
|
|
|
|
// Handle non-compliant redirects
|
|
if res.Header.Get("Location") != "" {
|
|
newURL, err := res.Location()
|
|
if err == nil {
|
|
opts.RootURL = newURL.String()
|
|
err = o.fs.pacer.Call(func() (bool, error) {
|
|
res, err = o.fs.srv.Call(ctx, &opts)
|
|
return shouldRetry(ctx, res, err)
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("Open failed: %w", err)
|
|
}
|
|
}
|
|
}
|
|
|
|
return res.Body, nil
|
|
}
|
|
|
|
// Update in to the object with the modTime given of the given size
|
|
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error {
|
|
return errorReadOnly
|
|
}
|
|
|
|
// MimeType of an Object if known, "" otherwise
|
|
func (o *Object) MimeType(ctx context.Context) string {
|
|
return o.contentType
|
|
}
|
|
|
|
var commandHelp = []fs.CommandHelp{{
|
|
Name: "metadata",
|
|
Short: "Show metadata about the DOI.",
|
|
Long: `This command returns a JSON object with some information about the DOI.
|
|
|
|
rclone backend medatadata doi:
|
|
|
|
It returns a JSON object representing metadata about the DOI.
|
|
`,
|
|
}, {
|
|
Name: "set",
|
|
Short: "Set command for updating the config parameters.",
|
|
Long: `This set command can be used to update the config parameters
|
|
for a running doi backend.
|
|
|
|
Usage Examples:
|
|
|
|
rclone backend set doi: [-o opt_name=opt_value] [-o opt_name2=opt_value2]
|
|
rclone rc backend/command command=set fs=doi: [-o opt_name=opt_value] [-o opt_name2=opt_value2]
|
|
rclone rc backend/command command=set fs=doi: -o doi=NEW_DOI
|
|
|
|
The option keys are named as they are in the config file.
|
|
|
|
This rebuilds the connection to the doi backend when it is called with
|
|
the new parameters. Only new parameters need be passed as the values
|
|
will default to those currently in use.
|
|
|
|
It doesn't return anything.
|
|
`,
|
|
}}
|
|
|
|
// Command the backend to run a named command
|
|
//
|
|
// The command run is name
|
|
// args may be used to read arguments from
|
|
// opts may be used to read optional arguments from
|
|
//
|
|
// The result should be capable of being JSON encoded
|
|
// If it is a string or a []string it will be shown to the user
|
|
// otherwise it will be JSON encoded and shown to the user like that
|
|
func (f *Fs) Command(ctx context.Context, name string, arg []string, opt map[string]string) (out interface{}, err error) {
|
|
switch name {
|
|
case "metadata":
|
|
return f.ShowMetadata(ctx)
|
|
case "set":
|
|
newOpt := f.opt
|
|
err := configstruct.Set(configmap.Simple(opt), &newOpt)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("reading config: %w", err)
|
|
}
|
|
_, err = f.httpConnection(ctx, &newOpt)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("updating session: %w", err)
|
|
}
|
|
f.opt = newOpt
|
|
keys := []string{}
|
|
for k := range opt {
|
|
keys = append(keys, k)
|
|
}
|
|
fs.Logf(f, "Updated config values: %s", strings.Join(keys, ", "))
|
|
return nil, nil
|
|
default:
|
|
return nil, fs.ErrorCommandNotFound
|
|
}
|
|
}
|
|
|
|
// ShowMetadata returns some metadata about the corresponding DOI
|
|
func (f *Fs) ShowMetadata(ctx context.Context) (metadata interface{}, err error) {
|
|
doiURL, err := url.Parse("https://doi.org/" + f.opt.Doi)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
info := map[string]any{}
|
|
info["DOI"] = f.opt.Doi
|
|
info["URL"] = doiURL.String()
|
|
info["metadataURL"] = f.endpointURL
|
|
info["provider"] = f.provider
|
|
return info, nil
|
|
}
|
|
|
|
// Check the interfaces are satisfied
|
|
var (
|
|
_ fs.Fs = (*Fs)(nil)
|
|
_ fs.PutStreamer = (*Fs)(nil)
|
|
_ fs.Commander = (*Fs)(nil)
|
|
_ fs.Object = (*Object)(nil)
|
|
_ fs.MimeTyper = (*Object)(nil)
|
|
)
|