rclone/backend/doi/doi.go
Flora Thiebaut 0a0fbe965f doi: add new doi backend
Add a new backend to support mounting datasets published with a digital
object identifier (DOI).
2025-04-15 15:49:36 +02:00

600 lines
17 KiB
Go

// Package doi provides a filesystem interface for digital objects identified by DOIs.
//
// See: https://www.doi.org/the-identifier/what-is-a-doi/
package doi
import (
"context"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"path"
"strings"
"time"
"github.com/rclone/rclone/backend/doi/api"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/config/configmap"
"github.com/rclone/rclone/fs/config/configstruct"
"github.com/rclone/rclone/fs/fserrors"
"github.com/rclone/rclone/fs/fshttp"
"github.com/rclone/rclone/fs/hash"
"github.com/rclone/rclone/lib/cache"
"github.com/rclone/rclone/lib/pacer"
"github.com/rclone/rclone/lib/rest"
)
const (
// the URL of the DOI resolver
//
// Reference: https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation
doiResolverAPIURL = "https://doi.org/api"
minSleep = 10 * time.Millisecond
maxSleep = 2 * time.Second
decayConstant = 2 // bigger for slower decay, exponential
)
var (
errorReadOnly = errors.New("doi remotes are read only")
timeUnset = time.Unix(0, 0)
)
func init() {
fsi := &fs.RegInfo{
Name: "doi",
Description: "DOI datasets",
NewFs: NewFs,
CommandHelp: commandHelp,
Options: []fs.Option{{
Name: "doi",
Help: "The DOI or the doi.org URL.",
Required: true,
}, {
Name: fs.ConfigProvider,
Help: `DOI provider.
The DOI provider can be set when rclone does not automatically recognize a supported DOI provider.`,
Examples: []fs.OptionExample{
{
Value: "auto",
Help: "Auto-detect provider",
},
{
Value: string(Zenodo),
Help: "Zenodo",
}, {
Value: string(Dataverse),
Help: "Dataverse",
}, {
Value: string(Invenio),
Help: "Invenio",
}},
Required: false,
Advanced: true,
}},
}
fs.Register(fsi)
}
// Provider defines the type of provider hosting the DOI
type Provider string
const (
// Zenodo provider, see https://zenodo.org
Zenodo Provider = "zenodo"
// Dataverse provider, see https://dataverse.harvard.edu
Dataverse Provider = "dataverse"
// Invenio provider, see https://inveniordm.docs.cern.ch
Invenio Provider = "invenio"
)
// Options defines the configuration for this backend
type Options struct {
Doi string `config:"doi"` // The DOI, a digital identifier of an object, usually a dataset
Provider string `config:"provider"` // The DOI provider
}
// Fs stores the interface to the remote HTTP files
type Fs struct {
name string // name of this remote
root string // the path we are working on
provider Provider // the DOI provider
features *fs.Features // optional features
opt Options // options for this backend
ci *fs.ConfigInfo // global config
endpoint *url.URL // the main API endpoint for this remote
endpointURL string // endpoint as a string
srv *rest.Client // the connection to the server
pacer *fs.Pacer // pacer for API calls
cache *cache.Cache // a cache for the remote metadata
}
// Object is a remote object that has been stat'd (so it exists, but is not necessarily open for reading)
type Object struct {
fs *Fs // what this object is part of
remote string // the remote path
contentURL string // the URL where the contents of the file can be downloaded
size int64 // size of the object
modTime time.Time // modification time of the object
contentType string // content type of the object
md5 string // MD5 hash of the object content
}
// Parse the input string as a DOI
// Examples:
// 10.1000/182 -> 10.1000/182
// https://doi.org/10.1000/182 -> 10.1000/182
// doi:10.1000/182 -> 10.1000/182
func parseDoi(doi string) string {
doiURL, err := url.Parse(doi)
if err != nil {
return doi
}
if doiURL.Scheme == "doi" {
return strings.TrimLeft(strings.TrimPrefix(doi, "doi:"), "/")
}
if strings.HasSuffix(doiURL.Hostname(), "doi.org") {
return strings.TrimLeft(doiURL.Path, "/")
}
return doi
}
// Resolve a DOI to a URL
// Reference: https://www.doi.org/the-identifier/resources/factsheets/doi-resolution-documentation
func resolveDoiURL(ctx context.Context, srv *rest.Client, pacer *fs.Pacer, opt *Options) (doiURL *url.URL, err error) {
var result api.DoiResolverResponse
params := url.Values{}
params.Add("index", "1")
opts := rest.Opts{
Method: "GET",
RootURL: doiResolverAPIURL,
Path: "/handles/" + opt.Doi,
Parameters: params,
}
err = pacer.Call(func() (bool, error) {
res, err := srv.CallJSON(ctx, &opts, nil, &result)
return shouldRetry(ctx, res, err)
})
if err != nil {
return nil, err
}
if result.ResponseCode != 1 {
return nil, fmt.Errorf("could not resolve DOI (error code %d)", result.ResponseCode)
}
resolvedURLStr := ""
for _, value := range result.Values {
if value.Type == "URL" && value.Data.Format == "string" {
valueStr, ok := value.Data.Value.(string)
if !ok {
return nil, fmt.Errorf("could not resolve DOI (incorrect response format)")
}
resolvedURLStr = valueStr
}
}
resolvedURL, err := url.Parse(resolvedURLStr)
if err != nil {
return nil, err
}
return resolvedURL, nil
}
// Resolve the passed configuration into a provider and enpoint
func resolveEndpoint(ctx context.Context, srv *rest.Client, pacer *fs.Pacer, opt *Options) (provider Provider, endpoint *url.URL, err error) {
resolvedURL, err := resolveDoiURL(ctx, srv, pacer, opt)
if err != nil {
return "", nil, err
}
switch opt.Provider {
case string(Dataverse):
return resolveDataverseEndpoint(resolvedURL)
case string(Invenio):
return resolveInvenioEndpoint(ctx, srv, pacer, resolvedURL)
case string(Zenodo):
return resolveZenodoEndpoint(ctx, srv, pacer, resolvedURL, opt.Doi)
}
hostname := strings.ToLower(resolvedURL.Hostname())
if hostname == "dataverse.harvard.edu" || activateDataverse(resolvedURL) {
return resolveDataverseEndpoint(resolvedURL)
}
if hostname == "zenodo.org" || strings.HasSuffix(hostname, ".zenodo.org") {
return resolveZenodoEndpoint(ctx, srv, pacer, resolvedURL, opt.Doi)
}
if activateInvenio(ctx, srv, pacer, resolvedURL) {
return resolveInvenioEndpoint(ctx, srv, pacer, resolvedURL)
}
return "", nil, fmt.Errorf("provider '%s' is not supported", resolvedURL.Hostname())
}
// Make the http connection from the passed options
func (f *Fs) httpConnection(ctx context.Context, opt *Options) (isFile bool, err error) {
provider, endpoint, err := resolveEndpoint(ctx, f.srv, f.pacer, opt)
if err != nil {
return false, err
}
// Update f with the new parameters
f.srv.SetRoot(endpoint.ResolveReference(&url.URL{Path: "/"}).String())
f.endpoint = endpoint
f.endpointURL = endpoint.String()
f.provider = provider
f.opt.Provider = string(provider)
// Determine if the root is a file
switch f.provider {
case Dataverse:
entries, err := f.listDataverseDoiFiles(ctx)
if err != nil {
return false, err
}
for _, entry := range entries {
if entry.remote == f.root {
isFile = true
break
}
}
case Invenio, Zenodo:
isFile = f.root != ""
}
return isFile, nil
}
// retryErrorCodes is a slice of error codes that we will retry
var retryErrorCodes = []int{
429, // Too Many Requests.
500, // Internal Server Error
502, // Bad Gateway
503, // Service Unavailable
504, // Gateway Timeout
509, // Bandwidth Limit Exceeded
}
// shouldRetry returns a boolean as to whether this resp and err
// deserve to be retried. It returns the err as a convenience
func shouldRetry(ctx context.Context, res *http.Response, err error) (bool, error) {
if fserrors.ContextError(ctx, &err) {
return false, err
}
return fserrors.ShouldRetry(err) || fserrors.ShouldRetryHTTP(res, retryErrorCodes), err
}
// NewFs creates a new Fs object from the name and root. It connects to
// the host specified in the config file.
func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, error) {
root = strings.Trim(root, "/")
// Parse config into Options struct
opt := new(Options)
err := configstruct.Set(m, opt)
if err != nil {
return nil, err
}
opt.Doi = parseDoi(opt.Doi)
client := fshttp.NewClient(ctx)
ci := fs.GetConfig(ctx)
f := &Fs{
name: name,
root: root,
opt: *opt,
ci: ci,
srv: rest.NewClient(client),
pacer: fs.NewPacer(ctx, pacer.NewDefault(pacer.MinSleep(minSleep), pacer.MaxSleep(maxSleep), pacer.DecayConstant(decayConstant))),
cache: cache.New(),
}
f.features = (&fs.Features{
CanHaveEmptyDirectories: true,
}).Fill(ctx, f)
isFile, err := f.httpConnection(ctx, opt)
if err != nil {
return nil, err
}
if isFile {
// return an error with an fs which points to the parent
newRoot := path.Dir(f.root)
if newRoot == "." {
newRoot = ""
}
f.root = newRoot
return f, fs.ErrorIsFile
}
return f, nil
}
// Name returns the configured name of the file system
func (f *Fs) Name() string {
return f.name
}
// Root returns the root for the filesystem
func (f *Fs) Root() string {
return f.root
}
// String returns the URL for the filesystem
func (f *Fs) String() string {
return fmt.Sprintf("DOI %s", f.opt.Doi)
}
// Features returns the optional features of this Fs
func (f *Fs) Features() *fs.Features {
return f.features
}
// Precision is the remote http file system's modtime precision, which we have no way of knowing. We estimate at 1s
func (f *Fs) Precision() time.Duration {
return time.Second
}
// Hashes returns hash.HashNone to indicate remote hashing is unavailable
func (f *Fs) Hashes() hash.Set {
return hash.Set(hash.MD5)
// return hash.Set(hash.None)
}
// Mkdir makes the root directory of the Fs object
func (f *Fs) Mkdir(ctx context.Context, dir string) error {
return errorReadOnly
}
// Remove a remote http file object
func (o *Object) Remove(ctx context.Context) error {
return errorReadOnly
}
// Rmdir removes the root directory of the Fs object
func (f *Fs) Rmdir(ctx context.Context, dir string) error {
return errorReadOnly
}
// NewObject creates a new remote http file object
func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) {
var entries []*Object
var err error
switch f.provider {
case Dataverse:
entries, err = f.listDataverseDoiFiles(ctx)
case Invenio, Zenodo:
entries, err = f.listInvevioDoiFiles(ctx)
default:
err = fmt.Errorf("provider type '%s' not supported", f.provider)
}
if err != nil {
return nil, err
}
for _, entry := range entries {
if entry.Remote() == remote {
return entry, nil
}
}
return nil, fs.ErrorObjectNotFound
}
// List the objects and directories in dir into entries. The
// entries can be returned in any order but should be for a
// complete directory.
//
// dir should be "" to list the root, and should not have
// trailing slashes.
//
// This should return ErrDirNotFound if the directory isn't
// found.
func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
switch f.provider {
case Dataverse:
return f.listDataverse(ctx, dir)
case Invenio, Zenodo:
return f.listInvenio(ctx, dir)
default:
return nil, fmt.Errorf("provider type '%s' not supported", f.provider)
}
}
// Put in to the remote path with the modTime given of the given size
//
// May create the object even if it returns an error - if so
// will return the object and the error, otherwise will return
// nil and the error
func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
return nil, errorReadOnly
}
// PutStream uploads to the remote path with the modTime given of indeterminate size
func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
return nil, errorReadOnly
}
// Fs is the filesystem this remote http file object is located within
func (o *Object) Fs() fs.Info {
return o.fs
}
// String returns the URL to the remote HTTP file
func (o *Object) String() string {
if o == nil {
return "<nil>"
}
return o.remote
}
// Remote the name of the remote HTTP file, relative to the fs root
func (o *Object) Remote() string {
return o.remote
}
// Hash returns "" since HTTP (in Go or OpenSSH) doesn't support remote calculation of hashes
func (o *Object) Hash(ctx context.Context, t hash.Type) (string, error) {
if t != hash.MD5 {
return "", hash.ErrUnsupported
}
return o.md5, nil
}
// Size returns the size in bytes of the remote http file
func (o *Object) Size() int64 {
return o.size
}
// ModTime returns the modification time of the remote http file
func (o *Object) ModTime(ctx context.Context) time.Time {
return o.modTime
}
// SetModTime sets the modification and access time to the specified time
//
// it also updates the info field
func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error {
return errorReadOnly
}
// Storable returns whether the remote http file is a regular file (not a directory, symbolic link, block device, character device, named pipe, etc.)
func (o *Object) Storable() bool {
return true
}
// Open a remote http file object for reading. Seek is supported
func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) {
fs.FixRangeOption(options, o.size)
opts := rest.Opts{
Method: "GET",
RootURL: o.contentURL,
Options: options,
}
var res *http.Response
err = o.fs.pacer.Call(func() (bool, error) {
res, err = o.fs.srv.Call(ctx, &opts)
return shouldRetry(ctx, res, err)
})
if err != nil {
return nil, fmt.Errorf("Open failed: %w", err)
}
// Handle non-compliant redirects
if res.Header.Get("Location") != "" {
newURL, err := res.Location()
if err == nil {
opts.RootURL = newURL.String()
err = o.fs.pacer.Call(func() (bool, error) {
res, err = o.fs.srv.Call(ctx, &opts)
return shouldRetry(ctx, res, err)
})
if err != nil {
return nil, fmt.Errorf("Open failed: %w", err)
}
}
}
return res.Body, nil
}
// Update in to the object with the modTime given of the given size
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error {
return errorReadOnly
}
// MimeType of an Object if known, "" otherwise
func (o *Object) MimeType(ctx context.Context) string {
return o.contentType
}
var commandHelp = []fs.CommandHelp{{
Name: "metadata",
Short: "Show metadata about the DOI.",
Long: `This command returns a JSON object with some information about the DOI.
rclone backend medatadata doi:
It returns a JSON object representing metadata about the DOI.
`,
}, {
Name: "set",
Short: "Set command for updating the config parameters.",
Long: `This set command can be used to update the config parameters
for a running doi backend.
Usage Examples:
rclone backend set doi: [-o opt_name=opt_value] [-o opt_name2=opt_value2]
rclone rc backend/command command=set fs=doi: [-o opt_name=opt_value] [-o opt_name2=opt_value2]
rclone rc backend/command command=set fs=doi: -o doi=NEW_DOI
The option keys are named as they are in the config file.
This rebuilds the connection to the doi backend when it is called with
the new parameters. Only new parameters need be passed as the values
will default to those currently in use.
It doesn't return anything.
`,
}}
// Command the backend to run a named command
//
// The command run is name
// args may be used to read arguments from
// opts may be used to read optional arguments from
//
// The result should be capable of being JSON encoded
// If it is a string or a []string it will be shown to the user
// otherwise it will be JSON encoded and shown to the user like that
func (f *Fs) Command(ctx context.Context, name string, arg []string, opt map[string]string) (out interface{}, err error) {
switch name {
case "metadata":
return f.ShowMetadata(ctx)
case "set":
newOpt := f.opt
err := configstruct.Set(configmap.Simple(opt), &newOpt)
if err != nil {
return nil, fmt.Errorf("reading config: %w", err)
}
_, err = f.httpConnection(ctx, &newOpt)
if err != nil {
return nil, fmt.Errorf("updating session: %w", err)
}
f.opt = newOpt
keys := []string{}
for k := range opt {
keys = append(keys, k)
}
fs.Logf(f, "Updated config values: %s", strings.Join(keys, ", "))
return nil, nil
default:
return nil, fs.ErrorCommandNotFound
}
}
// ShowMetadata returns some metadata about the corresponding DOI
func (f *Fs) ShowMetadata(ctx context.Context) (metadata interface{}, err error) {
doiURL, err := url.Parse("https://doi.org/" + f.opt.Doi)
if err != nil {
return nil, err
}
info := map[string]any{}
info["DOI"] = f.opt.Doi
info["URL"] = doiURL.String()
info["metadataURL"] = f.endpointURL
info["provider"] = f.provider
return info, nil
}
// Check the interfaces are satisfied
var (
_ fs.Fs = (*Fs)(nil)
_ fs.PutStreamer = (*Fs)(nil)
_ fs.Commander = (*Fs)(nil)
_ fs.Object = (*Object)(nil)
_ fs.MimeTyper = (*Object)(nil)
)