From 2fe6a37929893b9241176c20f5735a8d24f9b171 Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Wed, 10 Jun 2026 15:33:55 -0300 Subject: [PATCH 01/13] Black --- article/models.py | 635 +++++++++++++++++++++++++++------------------- 1 file changed, 379 insertions(+), 256 deletions(-) diff --git a/article/models.py b/article/models.py index 85a4e434..2469175e 100755 --- a/article/models.py +++ b/article/models.py @@ -7,8 +7,7 @@ from django.core.files.base import ContentFile from django.db import IntegrityError, models -from django.db.models import Q, Count, Min -from django.db.utils import DataError +from django.db.models import Count, Q from django.utils import timezone from django.utils.translation import gettext_lazy as _ from django_prometheus.models import ExportModelOperationsMixin @@ -16,38 +15,36 @@ from modelcluster.fields import ParentalKey from modelcluster.models import ClusterableModel from packtools.sps.formats import crossref, pmc, pubmed +from packtools.sps.libs.requester import NonRetryableError from packtools.sps.pid_provider.xml_sps_lib import XMLWithPre, generate_finger_print from wagtail.admin.panels import FieldPanel, InlinePanel, ObjectList, TabbedInterface from wagtail.models import Orderable from wagtailautocomplete.edit_handlers import AutocompletePanel -from packtools.sps.libs.requester import NonRetryableError from article import choices from article.utils.url_builder import ArticleURLBuilder from collection.models import Collection from core.forms import CoreAdminModelForm -from core.models import CommonControlField # Ajuste o import conforme sua estrutura from core.models import ( BaseExporter, BaseLegacyRecord, + CharFieldLangMixin, + CommonControlField, FlexibleDate, Language, License, LicenseStatement, TextLanguageMixin, - CharFieldLangMixin, ) from core.utils.utils import NonRetryableError, fetch_data from doi.models import DOI from doi_manager.models import CrossRefConfiguration -from institution.models import Publisher, Sponsor +from institution.models import Sponsor from issue.models import Issue, TableOfContents -from journal.models import Journal, SciELOJournal -from pid_provider.choices import PPXML_STATUS_DONE +from journal.models import Journal +from organization.models import NormAffiliation from pid_provider.models import PidProviderXML from pid_provider.provider import PidProvider -from location.models import Location -from organization.models import Organization, NormAffiliation from researcher.models import AffiliationMixin, CollabMixin, ResearchNameMixin from tracker.models import BaseEvent, EventSaveError, UnexpectedEvent from vocabulary.models import Keyword @@ -55,16 +52,22 @@ class RequestXMLException(Exception): """Exceção personalizada para erros na requisição de XML""" + pass + class XMLException(Exception): """Exceção personalizada para erros na requisição de XML""" + pass + class UnableToRegisterPIDError(Exception): """Exceção personalizada para erros ao registrar PID""" + pass + class AMArticle(BaseLegacyRecord): """ Modelo que representa a coleta de dados de Issue na API Article Meta. @@ -196,7 +199,7 @@ class Article( max_length=255, null=True, blank=True, - help_text=_("Armazena valores inválidos recebidos do XML") + help_text=_("Armazena valores inválidos recebidos do XML"), ) peer_review_stats = models.JSONField( @@ -210,21 +213,21 @@ class Article( max_length=10, null=True, blank=True, - help_text=_("Preprint publication date in ISO format (YYYY-MM-DD)") + help_text=_("Preprint publication date in ISO format (YYYY-MM-DD)"), ) received_dateiso = models.CharField( _("Received Date (ISO)"), max_length=10, null=True, blank=True, - help_text=_("Date the article was received, in ISO format (YYYY-MM-DD)") + help_text=_("Date the article was received, in ISO format (YYYY-MM-DD)"), ) accepted_dateiso = models.CharField( _("Accepted Date (ISO)"), max_length=10, null=True, blank=True, - help_text=_("Date the article was accepted, in ISO format (YYYY-MM-DD)") + help_text=_("Date the article was accepted, in ISO format (YYYY-MM-DD)"), ) days_preprint_to_received = models.IntegerField(null=True, blank=True) days_received_to_accepted = models.IntegerField(null=True, blank=True) @@ -852,7 +855,11 @@ def get_text_langs(self, collection_acron_list=None, fmt=None): params["fmt__in"] = ["html", "pdf"] if collection_acron_list: params["collection__acron3__in"] = collection_acron_list - for item in self.article_availability.filter(lang__isnull=False, **params).select_related('collection', 'lang').distinct(): + for item in ( + self.article_availability.filter(lang__isnull=False, **params) + .select_related("collection", "lang") + .distinct() + ): acron3 = item.collection.acron3 code2 = item.lang.code2 fmt = item.fmt @@ -865,7 +872,9 @@ def get_text_langs(self, collection_acron_list=None, fmt=None): def add_event(self, user, name): return ArticleEvent.create(user, self, name) - def add_related_article(self, user, href, ext_link_type, related_type, related_article=None): + def add_related_article( + self, user, href, ext_link_type, related_type, related_article=None + ): return RelatedArticle.create_or_update( user, self, @@ -935,7 +944,7 @@ def find_duplicated_pkg_names(cls, journal=None, journal_id=None): .filter(count__gt=1) .values_list("sps_pkg_name", flat=True) ) - + @classmethod def find_duplicated_pid_v2(cls, journal=None, journal_id=None): # Busca em ambos os campos de ISSN @@ -956,7 +965,14 @@ def find_duplicated_pid_v2(cls, journal=None, journal_id=None): ) @classmethod - def deduplicate_items(cls, user, journal=None, journal_id=None, mark_as_duplicated=False, deduplicate=False): + def deduplicate_items( + cls, + user, + journal=None, + journal_id=None, + mark_as_duplicated=False, + deduplicate=False, + ): """ Corrige todos os artigos marcados como DATA_STATUS_DUPLICATED com base nos ISSNs fornecidos. @@ -964,9 +980,7 @@ def deduplicate_items(cls, user, journal=None, journal_id=None, mark_as_duplicat issns: Lista de ISSNs para verificar duplicatas. user: Usuário que está executando a operação. """ - article_duplicated_pid_v2 = cls.find_duplicated_pid_v2( - journal, journal_id - ) + article_duplicated_pid_v2 = cls.find_duplicated_pid_v2(journal, journal_id) if article_duplicated_pid_v2.exists(): if mark_as_duplicated: cls.objects.filter(pid_v2__in=article_duplicated_pid_v2).exclude( @@ -983,9 +997,9 @@ def deduplicate_items(cls, user, journal=None, journal_id=None, mark_as_duplicat ) if article_duplicated_pkg_names.exists(): if mark_as_duplicated: - cls.objects.filter(sps_pkg_name__in=article_duplicated_pkg_names).exclude( - data_status=choices.DATA_STATUS_DUPLICATED - ).update( + cls.objects.filter( + sps_pkg_name__in=article_duplicated_pkg_names + ).exclude(data_status=choices.DATA_STATUS_DUPLICATED).update( data_status=choices.DATA_STATUS_DUPLICATED, ) if deduplicate: @@ -1073,7 +1087,6 @@ def create_or_update(cls, user, article, language, text): except cls.DoesNotExist: return cls.create(user, article, language, text) - @classmethod def get( cls, @@ -1475,7 +1488,6 @@ def article_directory_path(instance, filename): class ArticleFormat(CommonControlField): - article = ParentalKey( Article, null=True, @@ -1582,7 +1594,7 @@ def save_file(self, filename, content): if finger_print != self.finger_print: try: self.file.delete() - except Exception as e: + except Exception: pass self.file.save(filename, ContentFile(content)) self.finger_print = finger_print @@ -1801,7 +1813,15 @@ def get(cls, url): raise ValueError("ArticleSource.get requires url") @classmethod - def create(cls, user, url=None, source_date=None, am_article=None, force_update=None, auto_solve_pid_conflict=False): + def create( + cls, + user, + url=None, + source_date=None, + am_article=None, + force_update=None, + auto_solve_pid_conflict=False, + ): if not url: raise ValueError("ArticleSource.create requires url") @@ -1812,14 +1832,22 @@ def create(cls, user, url=None, source_date=None, am_article=None, force_update= obj.source_date = source_date obj.am_article = am_article obj.status = cls.StatusChoices.PENDING - obj.add_pid_provider(user, force_update, auto_solve_pid_conflict=auto_solve_pid_conflict) + obj.add_pid_provider( + user, force_update, auto_solve_pid_conflict=auto_solve_pid_conflict + ) return obj except IntegrityError: return cls.get(url=url) @classmethod def create_or_update( - cls, user, url=None, source_date=None, am_article=None, force_update=None, auto_solve_pid_conflict=False + cls, + user, + url=None, + source_date=None, + am_article=None, + force_update=None, + auto_solve_pid_conflict=False, ): try: logging.info( @@ -1834,7 +1862,9 @@ def create_or_update( obj.updated_by = user obj.source_date = source_date obj.am_article = am_article - obj.add_pid_provider(user, force_update, auto_solve_pid_conflict=auto_solve_pid_conflict) + obj.add_pid_provider( + user, force_update, auto_solve_pid_conflict=auto_solve_pid_conflict + ) return obj except cls.DoesNotExist: return cls.create( @@ -1843,7 +1873,7 @@ def create_or_update( source_date=source_date, am_article=am_article, force_update=force_update, - auto_solve_pid_conflict=auto_solve_pid_conflict + auto_solve_pid_conflict=auto_solve_pid_conflict, ) @cached_property @@ -1856,12 +1886,12 @@ def xml_with_pre(self): if self.file and self.file.path and os.path.isfile(self.file.path): try: return XMLWithPre.from_file(self.file.path) - except Exception as e: + except Exception: pass if self.url: try: return list(XMLWithPre.create(uri=self.url))[0] - except Exception as e: + except Exception: pass @cached_property @@ -1879,16 +1909,15 @@ def request_xml(self, detail): try: xml_with_pre = list(XMLWithPre.create(uri=self.url))[0] self.save_file( - f"{xml_with_pre.sps_pkg_name}.xml", xml_with_pre.tostring(pretty_print=True) + f"{xml_with_pre.sps_pkg_name}.xml", + xml_with_pre.tostring(pretty_print=True), ) except NonRetryableError as e: raise RequestXMLException( f"Non-retryable error while requesting XML: {e}" ) from e except Exception as e: - raise XMLException( - f"Error while requesting XML: {e}" - ) from e + raise XMLException(f"Error while requesting XML: {e}") from e def save_file(self, filename, content): try: @@ -1989,11 +2018,15 @@ def get_queryset_to_complete_data( @property def is_completed(self): if not self.pid_provider_xml: - logging.info(f"Not completed: ArticleSource {self.url} has no pid_provider_xml") + logging.info( + f"Not completed: ArticleSource {self.url} has no pid_provider_xml" + ) return False try: if not self.pid_provider_xml.xml_with_pre: - logging.info(f"Not completed: ArticleSource {self.url} has pid_provider_xml but no xml_with_pre") + logging.info( + f"Not completed: ArticleSource {self.url} has pid_provider_xml but no xml_with_pre" + ) return False except Exception: pass @@ -2004,7 +2037,9 @@ def is_completed(self): logging.info(f"Not completed: ArticleSource {self.url} has no file") return False if not self.file.path or not os.path.isfile(self.file.path): - logging.info(f"Not completed: ArticleSource {self.url} has file path invalid or file does not exist") + logging.info( + f"Not completed: ArticleSource {self.url} has file path invalid or file does not exist" + ) return False if self.status != ArticleSource.StatusChoices.COMPLETED: self.status = ArticleSource.StatusChoices.COMPLETED @@ -2033,9 +2068,7 @@ def add_pid_provider(self, user, force_update=False, auto_solve_pid_conflict=Fal # --- Etapa 1: request_xml --- has_valid_file = ( - self.file - and self.file.name - and os.path.isfile(self.file.path) + self.file and self.file.name and os.path.isfile(self.file.path) ) if force_update or not has_valid_file: @@ -2053,16 +2086,11 @@ def add_pid_provider(self, user, force_update=False, auto_solve_pid_conflict=Fal if force_update or not has_pid_provider: logging.info(f"Requesting PID for {self.url}") - self.request_pid( - user, detail, force_update, auto_solve_pid_conflict - ) - logging.info( - f"PID requested successfully for {self.pid_provider_xml}" - ) + self.request_pid(user, detail, force_update, auto_solve_pid_conflict) + logging.info(f"PID requested successfully for {self.pid_provider_xml}") else: logging.info( - f"Skipping request_pid: pid_provider_xml already set " - f"for {self.url}" + f"Skipping request_pid: pid_provider_xml already set for {self.url}" ) detail.append("request_pid skipped (pid_provider_xml already set)") @@ -2319,24 +2347,25 @@ def check_url(url, timeout=None): try: fetch_data(url, timeout=timeout or 30) return True - except Exception as e: + except Exception: raise class ArticleAffiliation(AffiliationMixin, CommonControlField): """ Represents an affiliation associated with an article. - + Inherits from AffiliationMixin (which provides raw organization fields and organization FK) and CommonControlField (for audit fields). """ + article = ParentalKey( Article, on_delete=models.CASCADE, related_name="affiliations", verbose_name=_("Article"), ) - + # Raw level fields for organization division raw_level_1 = models.CharField( _("Raw Level 1"), @@ -2359,7 +2388,7 @@ class ArticleAffiliation(AffiliationMixin, CommonControlField): blank=True, help_text=_("Raw third level of organization division"), ) - + # Normalized affiliation reference normalized = models.ForeignKey( NormAffiliation, @@ -2428,28 +2457,28 @@ def __str__(self): def get(cls, article, organization=None, **kwargs): """ Get an article affiliation by article and organization or other parameters. - + Args: article: Article instance organization: Organization instance (optional) - **kwargs: Additional filter parameters including raw_level_1, raw_level_2, + **kwargs: Additional filter parameters including raw_level_1, raw_level_2, raw_level_3, normalized, and any raw organization fields - + Returns: ArticleAffiliation instance - + Raises: ValueError: If article is not provided cls.DoesNotExist: If no matching instance found """ if not article: raise ValueError("ArticleAffiliation.get requires article parameter") - + params = {"article": article} if organization: params["organization"] = organization params.update(kwargs) - + try: return cls.objects.get(**params) except cls.MultipleObjectsReturned: @@ -2459,41 +2488,41 @@ def get(cls, article, organization=None, **kwargs): def create(cls, user, article, organization=None, **kwargs): """ Create a new article affiliation. - + Args: user: User creating the instance article: Article instance organization: Organization instance (optional) **kwargs: Additional field values including raw fields and level fields - + Returns: New ArticleAffiliation instance """ if not article: raise ValueError("ArticleAffiliation.create requires article parameter") - + obj = cls() obj.article = article if organization: obj.organization = organization - + # Set raw organization fields if provided (using parent class constant) for field in cls.RAW_ORGANIZATION_FIELDS: if field in kwargs: setattr(obj, field, kwargs[field]) - + # Set raw level fields if provided - for field in ['raw_level_1', 'raw_level_2', 'raw_level_3']: + for field in ["raw_level_1", "raw_level_2", "raw_level_3"]: if field in kwargs: setattr(obj, field, kwargs[field]) - + # Set normalized field if provided - if 'normalized' in kwargs: - obj.normalized = kwargs['normalized'] - + if "normalized" in kwargs: + obj.normalized = kwargs["normalized"] + if user: obj.creator = user - + obj.save() return obj @@ -2501,70 +2530,82 @@ def create(cls, user, article, organization=None, **kwargs): def create_or_update(cls, user, article, organization=None, **kwargs): """ Create a new article affiliation or update an existing one. - + Lookup strategy (in priority order): 1. If organization is provided, lookup by article + organization 2. Otherwise, lookup by article + raw_text if provided 3. Otherwise, lookup by article + raw_institution_name if provided - + Args: user: User creating/updating the instance article: Article instance organization: Organization instance (optional, used for lookup) **kwargs: Additional field values including level fields - + Returns: ArticleAffiliation instance (created or updated) """ if not article: - raise ValueError("ArticleAffiliation.create_or_update requires article parameter") - + raise ValueError( + "ArticleAffiliation.create_or_update requires article parameter" + ) + try: # Build lookup parameters lookup_params = {"article": article} if organization: lookup_params["organization"] = organization - elif 'raw_text' in kwargs and kwargs['raw_text']: - lookup_params["raw_text"] = kwargs['raw_text'] - elif 'raw_institution_name' in kwargs and kwargs['raw_institution_name']: - lookup_params["raw_institution_name"] = kwargs['raw_institution_name'] - + elif "raw_text" in kwargs and kwargs["raw_text"]: + lookup_params["raw_text"] = kwargs["raw_text"] + elif "raw_institution_name" in kwargs and kwargs["raw_institution_name"]: + lookup_params["raw_institution_name"] = kwargs["raw_institution_name"] + obj = cls.get(**lookup_params) - + # Update fields if organization: obj.organization = organization - + # Update raw organization fields (using parent class constant) for field in cls.RAW_ORGANIZATION_FIELDS: if field in kwargs: setattr(obj, field, kwargs[field]) - + # Update raw level fields - for field in ['raw_level_1', 'raw_level_2', 'raw_level_3']: + for field in ["raw_level_1", "raw_level_2", "raw_level_3"]: if field in kwargs: setattr(obj, field, kwargs[field]) - + # Update normalized field - if 'normalized' in kwargs: - obj.normalized = kwargs['normalized'] - + if "normalized" in kwargs: + obj.normalized = kwargs["normalized"] + if user: obj.updated_by = user - + obj.save() return obj - + except cls.DoesNotExist: - return cls.create(user=user, article=article, organization=organization, **kwargs) + return cls.create( + user=user, article=article, organization=organization, **kwargs + ) - def set_normalized(self, user, organization=None, location=None, level_1=None, level_2=None, level_3=None): + def set_normalized( + self, + user, + organization=None, + location=None, + level_1=None, + level_2=None, + level_3=None, + ): """ Set the normalized affiliation for this article affiliation. - + This method creates or retrieves a NormAffiliation instance and links it to this ArticleAffiliation. - + Args: user: User performing the operation organization: Organization instance (optional) @@ -2572,7 +2613,7 @@ def set_normalized(self, user, organization=None, location=None, level_1=None, l level_1: First level of division (optional) level_2: Second level of division (optional) level_3: Third level of division (optional) - + Returns: The updated ArticleAffiliation instance """ @@ -2592,23 +2633,29 @@ def set_normalized(self, user, organization=None, location=None, level_1=None, l def update_normalized(self, user, **kwargs): """ Update the normalized affiliation linked to this article affiliation. - + If no normalized affiliation exists, creates one. If updating would violate the unique_together constraint, reuses an existing matching NormAffiliation. - + Args: user: User performing the operation - **kwargs: Fields to update in NormAffiliation (organization, location, + **kwargs: Fields to update in NormAffiliation (organization, location, level_1, level_2, level_3) - + Returns: The updated ArticleAffiliation instance """ if self.normalized: # Check if we're updating any unique_together fields - unique_fields = ('organization', 'location', 'level_1', 'level_2', 'level_3') + unique_fields = ( + "organization", + "location", + "level_1", + "level_2", + "level_3", + ) updating_unique = any(field in kwargs for field in unique_fields) - + if updating_unique: # Build the target combination of unique_together values target_values = {} @@ -2617,17 +2664,20 @@ def update_normalized(self, user, **kwargs): target_values[field] = kwargs[field] else: target_values[field] = getattr(self.normalized, field, None) - + # Check if another NormAffiliation with this combination already exists - from django.db.models import Q - existing = NormAffiliation.objects.filter( - organization=target_values['organization'], - location=target_values['location'], - level_1=target_values['level_1'], - level_2=target_values['level_2'], - level_3=target_values['level_3'], - ).exclude(pk=self.normalized.pk).first() - + existing = ( + NormAffiliation.objects.filter( + organization=target_values["organization"], + location=target_values["location"], + level_1=target_values["level_1"], + level_2=target_values["level_2"], + level_3=target_values["level_3"], + ) + .exclude(pk=self.normalized.pk) + .first() + ) + if existing: # Reuse the existing NormAffiliation instead of updating self.normalized = existing @@ -2648,7 +2698,7 @@ def update_normalized(self, user, **kwargs): else: # Create new normalized affiliation self.normalized = NormAffiliation.create(user=user, **kwargs) - + self.updated_by = user self.save() return self @@ -2656,10 +2706,10 @@ def update_normalized(self, user, **kwargs): def clear_normalized(self, user): """ Remove the link to the normalized affiliation. - + Args: user: User performing the operation - + Returns: The updated ArticleAffiliation instance """ @@ -2671,22 +2721,23 @@ def clear_normalized(self, user): class ContribCollab(CollabMixin, CommonControlField): """ - Represents a collaboration (research group or consortium) associated with + Represents a collaboration (research group or consortium) associated with an article contributor's affiliation. - + This model is used to track when contributors work as part of a larger collaboration, such as research groups, consortiums, or multi-institutional - initiatives. It links a collaboration name to a specific article and + initiatives. It links a collaboration name to a specific article and optionally to an affiliation. - + Use cases: - Research consortiums (e.g., "COVID-19 Research Network") - Multi-institutional research groups - Collaborative initiatives credited in publications - + Inherits from CollabMixin (which provides the collab field) and CommonControlField (for audit fields). """ + article = ParentalKey( Article, on_delete=models.CASCADE, @@ -2729,16 +2780,16 @@ def __str__(self): def get(cls, article, collab, affiliation=None, **kwargs): """ Get a contrib collab by article, collab, and affiliation or other parameters. - + Args: article: Article instance (required) collab: Collaboration name (required) affiliation: ArticleAffiliation instance (optional) **kwargs: Additional filter parameters - + Returns: ContribCollab instance - + Raises: ValueError: If article or collab is not provided cls.DoesNotExist: If no matching instance found @@ -2747,12 +2798,12 @@ def get(cls, article, collab, affiliation=None, **kwargs): raise ValueError("ContribCollab.get requires article parameter") if not collab: raise ValueError("ContribCollab.get requires collab parameter") - + params = {"article": article, "collab": collab} if affiliation: params["affiliation"] = affiliation params.update(kwargs) - + try: return cls.objects.get(**params) except cls.MultipleObjectsReturned: @@ -2762,17 +2813,17 @@ def get(cls, article, collab, affiliation=None, **kwargs): def create(cls, user, article, collab, affiliation=None, **kwargs): """ Create a new contrib collab. - + Args: user: User creating the instance article: Article instance (required) collab: Collaboration name (required) affiliation: ArticleAffiliation instance (optional) **kwargs: Additional field values - + Returns: New ContribCollab instance - + Raises: ValueError: If article or collab is not provided """ @@ -2780,21 +2831,21 @@ def create(cls, user, article, collab, affiliation=None, **kwargs): raise ValueError("ContribCollab.create requires article parameter") if not collab: raise ValueError("ContribCollab.create requires collab parameter") - + obj = cls() obj.article = article obj.collab = collab if affiliation: obj.affiliation = affiliation - + # Set any additional fields from kwargs for key, value in kwargs.items(): if hasattr(obj, key): setattr(obj, key, value) - + if user: obj.creator = user - + obj.save() return obj @@ -2802,73 +2853,84 @@ def create(cls, user, article, collab, affiliation=None, **kwargs): def create_or_update(cls, user, article, collab, affiliation=None, **kwargs): """ Create a new contrib collab or update an existing one. - + Lookup strategy: Uses article + collab + affiliation (if provided) to find existing record. If a record exists with these identifiers, it will be updated. Otherwise, a new one is created. - + Args: user: User creating/updating the instance article: Article instance (required) collab: Collaboration name (required) affiliation: ArticleAffiliation instance (optional, used for lookup) **kwargs: Additional field values - + Returns: ContribCollab instance (created or updated) - + Raises: ValueError: If article or collab is not provided """ if not article: - raise ValueError("ContribCollab.create_or_update requires article parameter") + raise ValueError( + "ContribCollab.create_or_update requires article parameter" + ) if not collab: raise ValueError("ContribCollab.create_or_update requires collab parameter") - + try: # Build lookup parameters lookup_params = {"article": article, "collab": collab} if affiliation: lookup_params["affiliation"] = affiliation - - obj = cls.get(article=article, collab=collab, affiliation=affiliation, **kwargs) - + + obj = cls.get( + article=article, collab=collab, affiliation=affiliation, **kwargs + ) + # Update fields if affiliation is not None: obj.affiliation = affiliation - + # Update any additional fields from kwargs for key, value in kwargs.items(): if hasattr(obj, key): setattr(obj, key, value) - + if user: obj.updated_by = user - + obj.save() return obj - + except cls.DoesNotExist: - return cls.create(user=user, article=article, collab=collab, affiliation=affiliation, **kwargs) + return cls.create( + user=user, + article=article, + collab=collab, + affiliation=affiliation, + **kwargs, + ) class ContribPerson(ResearchNameMixin, CommonControlField): """ Represents a person contributor associated with an article. - + This model tracks individual contributors to an article, including their personal information (name, ORCID, email) and affiliation details. - + Inherits from ResearchNameMixin (which provides name-related fields like given_names, last_name, suffix, fullname, declared_name) and CommonControlField (for audit fields). """ + article = ParentalKey( Article, on_delete=models.CASCADE, related_name="contrib_persons", verbose_name=_("Article"), ) - + affiliation = models.ForeignKey( ArticleAffiliation, on_delete=models.SET_NULL, @@ -2877,7 +2939,7 @@ class ContribPerson(ResearchNameMixin, CommonControlField): related_name="contrib_persons", verbose_name=_("Affiliation"), ) - + orcid = models.CharField( _("ORCID"), max_length=19, # ORCID format: 0000-0002-1825-0097 (19 chars: 16 digits + 3 hyphens) @@ -2885,14 +2947,14 @@ class ContribPerson(ResearchNameMixin, CommonControlField): blank=True, help_text=_("ORCID identifier (e.g., 0000-0002-1825-0097)"), ) - + email = models.EmailField( _("Email"), max_length=254, null=True, blank=True, ) - + panels = [ AutocompletePanel("article"), FieldPanel("declared_name"), @@ -2904,16 +2966,16 @@ class ContribPerson(ResearchNameMixin, CommonControlField): FieldPanel("email"), AutocompletePanel("affiliation"), ] - + base_form_class = CoreAdminModelForm - + class Meta: indexes = [ models.Index(fields=["article"]), models.Index(fields=["affiliation"]), models.Index(fields=["orcid"]), ] - + def __str__(self): parts = [str(self.article)] if self.names: @@ -2921,65 +2983,80 @@ def __str__(self): if self.affiliation: parts.append(str(self.affiliation)) return " - ".join(parts) - + def get_formatted_fullname(self, use_comma_separator=True, suffix_position="end"): """ Get formatted full name from name components. - + Args: use_comma_separator: If True, adds comma separator after last_name and other parts (default: True) suffix_position: Position of suffix - "end" for "last_name, suffix, given_names" (default) or "after_given" for "last_name, given_names, suffix" - + Returns: Formatted name string or None if no name components available - + Raises: ValueError: If suffix_position is not "end" or "after_given" """ if suffix_position not in ("end", "after_given"): - raise ValueError(f"suffix_position must be 'end' or 'after_given', got '{suffix_position}'") - + raise ValueError( + f"suffix_position must be 'end' or 'after_given', got '{suffix_position}'" + ) + if not any([self.last_name, self.given_names, self.suffix]): return None - + parts = [] - + if self.last_name: parts.append(self.last_name) - + if suffix_position == "end" and self.suffix: parts.append(self.suffix) - + if self.given_names: parts.append(self.given_names) - + if suffix_position == "after_given" and self.suffix: parts.append(self.suffix) - + sep = ", " if use_comma_separator else " " return sep.join(parts) - + @property def names(self): """ Get the best available name representation. - + Returns fullname if available, otherwise declared_name if available, otherwise returns formatted name from components without comma separators (e.g., "Silva Jr Paulo" instead of "Silva, Jr, Paulo"). - + Returns: Name string or None if no name information available """ - return self.fullname or self.declared_name or self.get_formatted_fullname(use_comma_separator=False, suffix_position="end") - + return ( + self.fullname + or self.declared_name + or self.get_formatted_fullname( + use_comma_separator=False, suffix_position="end" + ) + ) + @classmethod - def get(cls, article, declared_name=None, orcid=None, given_names=None, - last_name=None, suffix=None): + def get( + cls, + article, + declared_name=None, + orcid=None, + given_names=None, + last_name=None, + suffix=None, + ): """ Get a contrib person by article and identifying parameters. - + Args: article: Article instance (required) declared_name: Declared name of the person (optional) @@ -2987,17 +3064,17 @@ def get(cls, article, declared_name=None, orcid=None, given_names=None, given_names: Given names (optional) last_name: Last name (optional) suffix: Name suffix (optional) - + Returns: ContribPerson instance - + Raises: ValueError: If article is not provided cls.DoesNotExist: If no matching instance found """ if not article: raise ValueError("ContribPerson.get requires article parameter") - + try: return cls.objects.get( article=article, @@ -3005,7 +3082,7 @@ def get(cls, article, declared_name=None, orcid=None, given_names=None, orcid=orcid, given_names=given_names, last_name=last_name, - suffix=suffix + suffix=suffix, ) except cls.MultipleObjectsReturned: return cls.objects.filter( @@ -3014,16 +3091,25 @@ def get(cls, article, declared_name=None, orcid=None, given_names=None, orcid=orcid, given_names=given_names, last_name=last_name, - suffix=suffix + suffix=suffix, ).first() - + @classmethod - def create(cls, user, article, declared_name=None, given_names=None, - last_name=None, suffix=None, orcid=None, email=None, - affiliation=None): + def create( + cls, + user, + article, + declared_name=None, + given_names=None, + last_name=None, + suffix=None, + orcid=None, + email=None, + affiliation=None, + ): """ Create a new contrib person. - + Args: user: User creating the instance article: Article instance (required) @@ -3034,16 +3120,16 @@ def create(cls, user, article, declared_name=None, given_names=None, orcid: ORCID identifier (optional) email: Email address (optional) affiliation: ArticleAffiliation instance (optional) - + Returns: New ContribPerson instance - + Raises: ValueError: If article is not provided """ if not article: raise ValueError("ContribPerson.create requires article parameter") - + obj = cls() obj.article = article if declared_name is not None: @@ -3060,28 +3146,39 @@ def create(cls, user, article, declared_name=None, given_names=None, obj.email = email if affiliation is not None: obj.affiliation = affiliation - + if user: obj.creator = user - + try: obj.save() return obj except IntegrityError: - return cls.get(article, declared_name, orcid, given_names, last_name, suffix) - + return cls.get( + article, declared_name, orcid, given_names, last_name, suffix + ) + @classmethod - def create_or_update(cls, user, article, declared_name=None, given_names=None, - last_name=None, suffix=None, orcid=None, email=None, - affiliation=None): + def create_or_update( + cls, + user, + article, + declared_name=None, + given_names=None, + last_name=None, + suffix=None, + orcid=None, + email=None, + affiliation=None, + ): """ Create a new contrib person or update an existing one. - - Lookup strategy: Uses article + declared_name + orcid + given_names + - last_name + suffix (when provided) to find existing record. If a record - exists with these identifiers, it will be updated. Otherwise, a new one + + Lookup strategy: Uses article + declared_name + orcid + given_names + + last_name + suffix (when provided) to find existing record. If a record + exists with these identifiers, it will be updated. Otherwise, a new one is created. - + Args: user: User creating/updating the instance article: Article instance (required) @@ -3092,19 +3189,21 @@ def create_or_update(cls, user, article, declared_name=None, given_names=None, orcid: ORCID identifier (optional) email: Email address (optional) affiliation: ArticleAffiliation instance (optional) - + Returns: ContribPerson instance (created or updated) - + Raises: ValueError: If article is not provided """ if not article: - raise ValueError("ContribPerson.create_or_update requires article parameter") - + raise ValueError( + "ContribPerson.create_or_update requires article parameter" + ) + try: obj = cls.get(article, declared_name, orcid, given_names, last_name, suffix) - + # Update fields (including those used in lookup for consistency) if declared_name is not None: obj.declared_name = declared_name @@ -3120,13 +3219,13 @@ def create_or_update(cls, user, article, declared_name=None, given_names=None, obj.email = email if affiliation is not None: obj.affiliation = affiliation - + if user: obj.updated_by = user - + obj.save() return obj - + except cls.DoesNotExist: return cls.create( user=user, @@ -3137,17 +3236,17 @@ def create_or_update(cls, user, article, declared_name=None, given_names=None, suffix=suffix, orcid=orcid, email=email, - affiliation=affiliation + affiliation=affiliation, ) - + def add_orcid(self, user, orcid): """ Add or update the ORCID identifier for this contributor. - + Args: user: User performing the operation orcid: ORCID identifier string - + Returns: The updated ContribPerson instance """ @@ -3155,18 +3254,27 @@ def add_orcid(self, user, orcid): self.updated_by = user self.save() return self - - def add_raw_affiliation(self, user, raw_text=None, raw_institution_name=None, - raw_country_name=None, raw_country_code=None, - raw_state_name=None, raw_state_acron=None, - raw_city_name=None, raw_level_1=None, - raw_level_2=None, raw_level_3=None): + + def add_raw_affiliation( + self, + user, + raw_text=None, + raw_institution_name=None, + raw_country_name=None, + raw_country_code=None, + raw_state_name=None, + raw_state_acron=None, + raw_city_name=None, + raw_level_1=None, + raw_level_2=None, + raw_level_3=None, + ): """ Add or update raw affiliation data for this contributor. - + Creates or updates an ArticleAffiliation with raw affiliation information and links it to this ContribPerson. - + Args: user: User performing the operation raw_text: Raw affiliation text (optional) @@ -3179,7 +3287,7 @@ def add_raw_affiliation(self, user, raw_text=None, raw_institution_name=None, raw_level_1: Raw first level division (optional) raw_level_2: Raw second level division (optional) raw_level_3: Raw third level division (optional) - + Returns: The updated ContribPerson instance """ @@ -3205,29 +3313,34 @@ def add_raw_affiliation(self, user, raw_text=None, raw_institution_name=None, aff_kwargs["raw_level_2"] = raw_level_2 if raw_level_3: aff_kwargs["raw_level_3"] = raw_level_3 - + # Create or update the affiliation affiliation = ArticleAffiliation.create_or_update( - user=user, - article=self.article, - **aff_kwargs + user=user, article=self.article, **aff_kwargs ) - + # Link it to this contrib person self.affiliation = affiliation self.updated_by = user self.save() return self - - def add_normalized_affiliation(self, user, organization=None, location=None, - level_1=None, level_2=None, level_3=None): + + def add_normalized_affiliation( + self, + user, + organization=None, + location=None, + level_1=None, + level_2=None, + level_3=None, + ): """ Add normalized affiliation data to this contributor's affiliation. - + This method completes the affiliation.normalized field by creating or updating a NormAffiliation and linking it to the ArticleAffiliation. If this ContribPerson doesn't have an affiliation yet, one will be created. - + Args: user: User performing the operation organization: Organization instance (optional) @@ -3235,19 +3348,18 @@ def add_normalized_affiliation(self, user, organization=None, location=None, level_1: First level of organization division (optional) level_2: Second level of organization division (optional) level_3: Third level of organization division (optional) - + Returns: The updated ContribPerson instance """ # If no affiliation exists, create one first if not self.affiliation: self.affiliation = ArticleAffiliation.create( - user=user, - article=self.article + user=user, article=self.article ) # Save to persist the relationship before using it self.save() - + # Add normalized affiliation to the ArticleAffiliation self.affiliation.add_normalized_affiliation( user=user, @@ -3255,9 +3367,9 @@ def add_normalized_affiliation(self, user, organization=None, location=None, location=location, level_1=level_1, level_2=level_2, - level_3=level_3 + level_3=level_3, ) - + self.updated_by = user self.save() return self @@ -3378,11 +3490,12 @@ class RelatedArticle(CommonControlField): verbose_name=_("Related Article"), help_text=_("The related article instance, if available in the system."), ) + class Meta: - unique_together = [('article', 'href', 'related_type')] + unique_together = [("article", "href", "related_type")] indexes = [ - models.Index(fields=['article', 'href', 'related_type']), - models.Index(fields=['href']), + models.Index(fields=["article", "href", "related_type"]), + models.Index(fields=["href"]), ] verbose_name = _("Related Article") verbose_name_plural = _("Related Articles") @@ -3402,10 +3515,12 @@ def __str__(self): @property def data(self): return { - 'href': self.href, - 'ext_link_type': self.ext_link_type, - 'related_type': self.related_type, - 'related_article_id': self.related_article.id if self.related_article else None, + "href": self.href, + "ext_link_type": self.ext_link_type, + "related_type": self.related_type, + "related_article_id": self.related_article.id + if self.related_article + else None, } @classmethod @@ -3418,7 +3533,9 @@ def get(cls, article, related_type, href): ) @classmethod - def create(cls, user, article, href, ext_link_type, related_type, related_article=None): + def create( + cls, user, article, href, ext_link_type, related_type, related_article=None + ): """Cria um novo relacionamento entre artigos.""" if not user: raise ValueError("User is required") @@ -3445,11 +3562,15 @@ def create(cls, user, article, href, ext_link_type, related_type, related_articl return cls.get(article, related_type, href) @classmethod - def create_or_update(cls, user, article, href, ext_link_type, related_type, related_article=None): + def create_or_update( + cls, user, article, href, ext_link_type, related_type, related_article=None + ): """Obtém ou cria um relacionamento entre artigos.""" try: if not related_article and ext_link_type == "doi": - related_article = Article.objects.filter(doi__value__iexact=href).first() + related_article = Article.objects.filter( + doi__value__iexact=href + ).first() obj = cls.get(article, related_type, href) if obj.related_article != related_article: @@ -3458,7 +3579,9 @@ def create_or_update(cls, user, article, href, ext_link_type, related_type, rela obj.save() return obj except cls.DoesNotExist: - return cls.create(user, article, href, ext_link_type, related_type, related_article) + return cls.create( + user, article, href, ext_link_type, related_type, related_article + ) class ArticlePeerReviewStats(Article): @@ -3518,11 +3641,11 @@ class Meta: def get_queryset(self, request): """QuerySet otimizado com select_related e prefetch_related""" return self.objects.select_related( - 'journal', - 'issue', - 'journal__official', + "journal", + "issue", + "journal__official", ).prefetch_related( - 'doi', - 'titles', - 'languages', + "doi", + "titles", + "languages", ) From 3ca612c9e75f4997242bfc8689dd043c541aa53a Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Wed, 10 Jun 2026 15:52:20 -0300 Subject: [PATCH 02/13] black --- article/sources/xmlsps.py | 117 +++++++++++++++++++++++++------------- 1 file changed, 76 insertions(+), 41 deletions(-) diff --git a/article/sources/xmlsps.py b/article/sources/xmlsps.py index 24624220..db21ace8 100755 --- a/article/sources/xmlsps.py +++ b/article/sources/xmlsps.py @@ -5,7 +5,6 @@ from itertools import product from django.utils.translation import gettext_lazy as _ -from lxml import etree from packtools.sps.models.article_abstract import ArticleAbstract from packtools.sps.models.article_and_subarticles import ArticleAndSubArticles from packtools.sps.models.article_contribs import ArticleContribs, XMLContribs @@ -33,19 +32,20 @@ DocumentAbstract, DocumentTitle, ) -from core.models import Language, LicenseStatement, License +from core.models import Language, License, LicenseStatement from core.utils.extracts_normalized_email import extracts_normalized_email from doi.models import DOI from institution.models import Sponsor -from issue.models import Issue, TableOfContents, AMIssue from issue.articlemeta.loader import load_issue_sections +from issue.models import AMIssue, Issue, TableOfContents from journal.models import Journal -from location.models import Location -from pid_provider.choices import PPXML_STATUS_UNMATCHED_JOURNAL_OR_ISSUE, PPXML_STATUS_INVALID -from pid_provider.models import PidProviderXML +from pid_provider.choices import ( + PPXML_STATUS_INVALID, + PPXML_STATUS_UNMATCHED_JOURNAL_OR_ISSUE, +) + # Researcher no longer used - replaced by ContribPerson # from researcher.models import Affiliation, Researcher -from tracker.models import UnexpectedEvent from vocabulary.models import Keyword @@ -102,9 +102,7 @@ def load_article(user, pp_xml): raise ValueError("User is required") if not pp_xml: - raise ValueError( - "load_article() requires params: pp_xml" - ) + raise ValueError("load_article() requires params: pp_xml") try: xml_with_pre = pp_xml.xml_with_pre @@ -126,10 +124,15 @@ def load_article(user, pp_xml): "timestamp": datetime.now().isoformat(), } ] - pp_xml.add_event(name="load_article", proc_status=PPXML_STATUS_INVALID, detail=detail, errors=errors, exceptions=e) + pp_xml.add_event( + name="load_article", + proc_status=PPXML_STATUS_INVALID, + detail=detail, + errors=errors, + exceptions=e, + ) raise ValueError(f"Unable to get XML to load article from {pp_xml}: {e}") - try: errors = [] article = None @@ -141,10 +144,12 @@ def load_article(user, pp_xml): sps_pkg_name = xml_with_pre.sps_pkg_name logging.info(f"Pid Provider XML: {pid_v3} {sps_pkg_name}") - + journal = get_journal(xmltree=xmltree, errors=errors) if not journal: - raise ValueError(f"Not found journal for pid provider xml: {pid_v3} {sps_pkg_name}") + raise ValueError( + f"Not found journal for pid provider xml: {pid_v3} {sps_pkg_name}" + ) issue = get_issue( xmltree=xmltree, journal=journal, @@ -152,7 +157,9 @@ def load_article(user, pp_xml): errors=errors, ) if not issue: - raise ValueError(f"Not found issue for pid provider xml: {pid_v3} {sps_pkg_name}") + raise ValueError( + f"Not found issue for pid provider xml: {pid_v3} {sps_pkg_name}" + ) # CRIAÇÃO/OBTENÇÃO DO OBJETO PRINCIPAL article = Article.create_or_update( @@ -181,9 +188,7 @@ def load_article(user, pp_xml): article.article_type = get_or_create_article_type( xmltree=xmltree, user=user, errors=errors ) - add_peer_review_dates( - xmltree=xmltree, article=article, errors=errors - ) + add_peer_review_dates(xmltree=xmltree, article=article, errors=errors) # FOREIGN KEYS SIMPLES article.journal = journal @@ -207,7 +212,9 @@ def load_article(user, pp_xml): article.languages.add(main_lang) article.sections.set( - get_or_create_toc_sections(xmltree=xmltree, user=user, errors=errors, issue=article.issue) + get_or_create_toc_sections( + xmltree=xmltree, user=user, errors=errors, issue=article.issue + ) ) article.titles.set( create_or_update_titles( @@ -262,8 +269,14 @@ def load_article(user, pp_xml): event.finish(errors=errors, exceptions=traceback.format_exc()) raise - pp_xml.add_event(name="load_article", proc_status=PPXML_STATUS_UNMATCHED_JOURNAL_OR_ISSUE, detail=detail, errors=errors, exceptions=e) - + pp_xml.add_event( + name="load_article", + proc_status=PPXML_STATUS_UNMATCHED_JOURNAL_OR_ISSUE, + detail=detail, + errors=errors, + exceptions=e, + ) + raise @@ -291,18 +304,38 @@ def add_peer_review_dates(xmltree, article, errors): article.accepted_dateiso = peer_review_stats.get("accepted_date") # Extrair intervalos em dias - article.days_preprint_to_received = peer_review_stats.get("days_from_preprint_to_received") - article.days_received_to_accepted = peer_review_stats.get("days_from_received_to_accepted") - article.days_accepted_to_published = peer_review_stats.get("days_from_accepted_to_published") - article.days_preprint_to_published = peer_review_stats.get("days_from_preprint_to_published") - article.days_receive_to_published = peer_review_stats.get("days_from_received_to_published") + article.days_preprint_to_received = peer_review_stats.get( + "days_from_preprint_to_received" + ) + article.days_received_to_accepted = peer_review_stats.get( + "days_from_received_to_accepted" + ) + article.days_accepted_to_published = peer_review_stats.get( + "days_from_accepted_to_published" + ) + article.days_preprint_to_published = peer_review_stats.get( + "days_from_preprint_to_published" + ) + article.days_receive_to_published = peer_review_stats.get( + "days_from_received_to_published" + ) # Extrair flags de estimativa - article.days_preprint_to_received_estimated = peer_review_stats.get("estimated_days_from_preprint_to_received") - article.days_received_to_accepted_estimated = peer_review_stats.get("estimated_days_from_received_to_accepted") - article.days_accepted_to_published_estimated = peer_review_stats.get("estimated_days_from_accepted_to_published") - article.days_preprint_to_published_estimated = peer_review_stats.get("estimated_days_from_preprint_to_published") - article.days_receive_to_published_estimated = peer_review_stats.get("estimated_days_from_received_to_published") + article.days_preprint_to_received_estimated = peer_review_stats.get( + "estimated_days_from_preprint_to_received" + ) + article.days_received_to_accepted_estimated = peer_review_stats.get( + "estimated_days_from_received_to_accepted" + ) + article.days_accepted_to_published_estimated = peer_review_stats.get( + "estimated_days_from_accepted_to_published" + ) + article.days_preprint_to_published_estimated = peer_review_stats.get( + "estimated_days_from_preprint_to_published" + ) + article.days_receive_to_published_estimated = peer_review_stats.get( + "estimated_days_from_received_to_published" + ) except Exception as e: add_error(errors, "add_peer_review_dates", e) @@ -356,9 +389,7 @@ def add_data_availability_status(xmltree, errors, article, user): for item in items: DataAvailabilityStatement.create_or_update( - user=user, - article=article, - **item + user=user, article=article, **item ) except Exception as e: add_error(errors, "add_data_availability_status", e) @@ -510,9 +541,13 @@ def get_or_create_toc_sections(xmltree, user, errors, issue): if not section_title: continue try: - issue_sections = TableOfContents.get_items_by_title(issue=issue, title=section_title) + issue_sections = TableOfContents.get_items_by_title( + issue=issue, title=section_title + ) if not issue_sections.exists(): - raise TableOfContents.DoesNotExist(f"Unable to find TOC section {section_title} for issue {issue}") + raise TableOfContents.DoesNotExist( + f"Unable to find TOC section {section_title} for issue {issue}" + ) for obj in issue_sections: data.append(obj) except Exception as e: @@ -687,9 +722,9 @@ def create_or_update_contrib_persons(xmltree, article, user, item, errors): ) data.append(obj) else: - # When an author has multiple affiliations in XML, we create one - # ContribPerson record per affiliation. This is intentional as per - # SciELO's data model where each author-affiliation combination + # When an author has multiple affiliations in XML, we create one + # ContribPerson record per affiliation. This is intentional as per + # SciELO's data model where each author-affiliation combination # should be tracked separately. for aff in affs: raw_email = author.get("email") or aff.get("email") @@ -1042,7 +1077,7 @@ def add_related_articles(xmltree, article, user, errors): user=user, href=href, ext_link_type=ext_link_type, - related_type=related_type + related_type=related_type, ) except Exception as e: @@ -1050,7 +1085,7 @@ def add_related_articles(xmltree, article, user, errors): errors, "add_related_articles.process_item", e, - related_article_data=related_article_data + related_article_data=related_article_data, ) except Exception as e: From 9747da9bd91cf8c01cf94b07cbedc652908cdbdf Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Wed, 10 Jun 2026 16:21:57 -0300 Subject: [PATCH 03/13] black --- issue/tasks.py | 251 +++++++++++++++++++++++++++---------------------- 1 file changed, 140 insertions(+), 111 deletions(-) diff --git a/issue/tasks.py b/issue/tasks.py index fd12abc1..f0092cfe 100644 --- a/issue/tasks.py +++ b/issue/tasks.py @@ -3,16 +3,22 @@ from django.contrib.auth import get_user_model +from collection.models import Collection from config import celery_app from core.utils.utils import _get_user -from collection.models import Collection from issue import controller -from issue.articlemeta.loader import harvest_issue_identifiers, harvest_and_load_issue -from issue.articlemeta.loader import create_issue_from_am_issue, load_issue_sections, load_issue_titles, load_bibliographic_strips, get_issue_data_from_am_issue +from issue.articlemeta.loader import ( + create_issue_from_am_issue, + get_issue_data_from_am_issue, + harvest_and_load_issue, + harvest_issue_identifiers, + load_bibliographic_strips, + load_issue_sections, + load_issue_titles, +) from issue.models import AMIssue from tracker.models import UnexpectedEvent - User = get_user_model() logger = logging.getLogger(__name__) @@ -32,7 +38,7 @@ def load_issue_from_articlemeta( ): """ Carrega issues do ArticleMeta para collections específicas. - + Args: user_id: ID do usuário username: Nome do usuário @@ -47,18 +53,20 @@ def load_issue_from_articlemeta( # Obter lista de acrônimos das collections collection_acronyms = Collection.get_acronyms(collection_acron) - + for acron3 in collection_acronyms: try: logger.info(f"Harvesting issues for collection {acron3}") - + # Coletar identificadores de issues for issue_identifier in harvest_issue_identifiers( acron3, from_date, until_date, force_update, timeout, verify ): try: - logger.info(f"Scheduling load for issue {issue_identifier.get('code')} in collection {acron3}") - + logger.info( + f"Scheduling load for issue {issue_identifier.get('code')} in collection {acron3}" + ) + # Agendar task para carregar issue específico task_harvest_and_load_issue.delay( user_id=user.id, @@ -78,7 +86,7 @@ def load_issue_from_articlemeta( "collection_acron": acron3, "issue_identifier": issue_identifier, "force_update": force_update, - } + }, ) except Exception as e: exc_type, exc_value, exc_traceback = sys.exc_info() @@ -91,7 +99,7 @@ def load_issue_from_articlemeta( "from_date": from_date, "until_date": until_date, "force_update": force_update, - } + }, ) except Exception as e: @@ -104,8 +112,8 @@ def load_issue_from_articlemeta( "collection_acron": collection_acron, "from_date": from_date, "until_date": until_date, - "force_update": force_update - } + "force_update": force_update, + }, ) @@ -119,14 +127,13 @@ def task_harvest_and_load_issue( force_update=None, timeout=30, verify=False, - ): """ Carrega um issue específico do ArticleMeta. - + Args: user_id: ID do usuário - username: Nome do usuário + username: Nome do usuário collection_acron: Acrônimo da collection issue_identifier: Dados do identificador do issue force_update: Forçar atualização de registros existentes @@ -135,25 +142,25 @@ def task_harvest_and_load_issue( """ try: user = _get_user(request=self.request, user_id=user_id, username=username) - + # Validações if not issue_identifier: raise ValueError("issue_identifier is required") if not collection_acron: raise ValueError("collection_acron is required") - + # Extrair dados do identificador url = issue_identifier.get("url") - code = issue_identifier.get("code") + code = issue_identifier.get("code") processing_date = issue_identifier.get("processing_date") - + if not url: raise ValueError("URL is required in issue_identifier") if not code: raise ValueError("Code is required in issue_identifier") - + logger.info(f"Loading issue {code} from {url}") - + # Carregar issue issue = harvest_and_load_issue( user=user, @@ -165,13 +172,13 @@ def task_harvest_and_load_issue( timeout=timeout, verify=verify, ) - + if issue: logger.info(f"Successfully loaded issue {issue}") return issue.id else: logger.warning(f"Failed to load issue {code} from {url}") - + except Exception as e: exc_type, exc_value, exc_traceback = sys.exc_info() UnexpectedEvent.create( @@ -182,7 +189,7 @@ def task_harvest_and_load_issue( "collection_acron": collection_acron, "issue_identifier": issue_identifier, "force_update": force_update, - } + }, ) # Re-raise para que Celery marque a task como falhada raise @@ -206,7 +213,7 @@ def task_export_issues_to_articlemeta( ): """ Export issues to ArticleMeta Database with flexible filtering. - + Args: collection_acron_list: List of collections to export journal_acron_list: Filter by journal acronyms @@ -220,16 +227,16 @@ def task_export_issues_to_articlemeta( from_date: Start date for filtering until_date: End date for filtering days_to_go_back: Number of days to go back from current date - + Returns: dict: Result of bulk export operation - + Raises: Exception: Any unexpected error during export """ try: user = _get_user(request=self.request, user_id=user_id, username=username) - + result = controller.bulk_export_issues_to_articlemeta( user, collection_acron_list, @@ -243,12 +250,12 @@ def task_export_issues_to_articlemeta( days_to_go_back, force_update, ) - + return result - + except Exception as e: exc_type, exc_value, exc_traceback = sys.exc_info() - + UnexpectedEvent.create( exception=e, exc_traceback=exc_traceback, @@ -266,10 +273,10 @@ def task_export_issues_to_articlemeta( "days_to_go_back": days_to_go_back, "user_id": user_id, "username": username, - "task_id": self.request.id if hasattr(self.request, 'id') else None, + "task_id": self.request.id if hasattr(self.request, "id") else None, }, ) - + # Re-raise para que o Celery possa tratar a exceção adequadamente raise @@ -289,7 +296,7 @@ def task_export_issue_to_articlemeta( ): """ Export a single issue to ArticleMeta Database. - + Args: collection_acron: Collection acronym (required) journal_acron: Journal acronym (required) @@ -300,10 +307,10 @@ def task_export_issue_to_articlemeta( force_update: Force update existing records user_id: User ID for authentication username: Username for authentication - + Returns: dict: Result of bulk export operation - + Raises: ValueError: If required parameters are missing Exception: Any unexpected error during export @@ -314,16 +321,16 @@ def task_export_issue_to_articlemeta( raise ValueError("collection_acron is required") if not journal_acron: raise ValueError("journal_acron is required") - + # Validação adicional: pelo menos um identificador do issue deve ser fornecido if not any([publication_year, volume, number, supplement]): raise ValueError( "At least one issue identifier is required: " "publication_year, volume, number, or supplement" ) - + user = _get_user(request=self.request, user_id=user_id, username=username) - + result = controller.bulk_export_issues_to_articlemeta( user, collection_acron_list=[collection_acron], @@ -334,11 +341,11 @@ def task_export_issue_to_articlemeta( supplement=supplement, force_update=force_update, ) - - return result + + return result except Exception as e: exc_type, exc_value, exc_traceback = sys.exc_info() - + UnexpectedEvent.create( exception=e, exc_traceback=exc_traceback, @@ -353,11 +360,11 @@ def task_export_issue_to_articlemeta( "force_update": force_update, "user_id": user_id, "username": username, - "task_id": self.request.id if hasattr(self.request, 'id') else None, + "task_id": self.request.id if hasattr(self.request, "id") else None, "error_type": "unexpected_error", }, ) - + # Re-raise para que o Celery possa tratar a exceção adequadamente raise @@ -381,7 +388,7 @@ def task_update_issues_from_amissue( ): """ Atualiza Issues a partir de registros AMIssue com filtros específicos. - + Args: user_id: ID do usuário username: Nome do usuário @@ -396,13 +403,13 @@ def task_update_issues_from_amissue( supplement: Suplemento para filtrar Issues force_update: Forçar atualização de Issues existentes only_without_new_record: Processar apenas AMIssue sem new_record associado - + Returns: dict: Resultado da operação com estatísticas """ try: user = _get_user(request=self.request, user_id=user_id, username=username) - + try: # Limpar AMIssue órfãos (sem data e sem URL) AMIssue.objects.filter(data__isnull=True, url__isnull=True).delete() @@ -411,32 +418,36 @@ def task_update_issues_from_amissue( # Construir filtros para AMIssue filters = {} - + if collection_acron: try: collection = Collection.objects.get(acron3=collection_acron) - filters['collection'] = collection + filters["collection"] = collection except Collection.DoesNotExist: - raise ValueError(f"Collection with acron3 '{collection_acron}' not found") - + raise ValueError( + f"Collection with acron3 '{collection_acron}' not found" + ) + if issue_status: - if issue_status not in ['pending', 'todo', 'done']: - raise ValueError("issue_status must be one of: 'pending', 'todo', 'done'") - filters['status'] = issue_status + if issue_status not in ["pending", "todo", "done"]: + raise ValueError( + "issue_status must be one of: 'pending', 'todo', 'done'" + ) + filters["status"] = issue_status if processing_date_from: - filters['processing_date__gte'] = processing_date_from - + filters["processing_date__gte"] = processing_date_from + if processing_date_until: - filters['processing_date__lte'] = processing_date_until - + filters["processing_date__lte"] = processing_date_until + if only_without_new_record: - filters['new_record__isnull'] = True - + filters["new_record__isnull"] = True + # Filtros adicionais baseados no PID (que contém informações do issue) if journal_pid: - filters['pid__icontains'] = journal_pid - + filters["pid__icontains"] = journal_pid + if not force_update: if not filters: # Garantir que algum filtro exista @@ -444,46 +455,48 @@ def task_update_issues_from_amissue( # Obter queryset de AMIssue am_issues = AMIssue.objects.filter(**filters) - + if not am_issues.exists(): result = { "processed": 0, "updated": 0, "created": 0, "errors": 0, - "message": "No AMIssue records found with the specified filters" + "message": "No AMIssue records found with the specified filters", } return result - + # Estatísticas de processamento stats = { "processed": 0, "updated": 0, "created": 0, "errors": 0, - "error_details": [] + "error_details": [], } - + logger.info(f"Found {am_issues.count()} AMIssue records to process") - + for am_issue in am_issues.iterator(): try: stats["processed"] += 1 - + # Extrair dados do AMIssue para aplicar filtros adicionais issue_data = get_issue_data_from_am_issue(am_issue, user) if not issue_data: stats["errors"] += 1 - stats["error_details"].append({ - "am_issue_id": am_issue.id, - "pid": am_issue.pid, - "error": "Unable to extract issue data from AMIssue" - }) + stats["error_details"].append( + { + "am_issue_id": am_issue.id, + "pid": am_issue.pid, + "error": "Unable to extract issue data from AMIssue", + } + ) continue - + # Aplicar filtros específicos do Issue should_skip = False - + if year and issue_data.get("year") != year: should_skip = True if volume and issue_data.get("volume") != volume: @@ -492,51 +505,65 @@ def task_update_issues_from_amissue( should_skip = True if supplement and issue_data.get("supplement") != supplement: should_skip = True - + if should_skip: continue - + # Verificar se já existe Issue para este AMIssue existing_issue = am_issue.new_record - + if existing_issue and not force_update: # Issue já existe e não estamos forçando update continue - + if existing_issue and force_update: # Atualizar Issue existente - logger.info(f"Updating existing Issue {existing_issue.id} from AMIssue {am_issue.id}") - + logger.info( + f"Updating existing Issue {existing_issue.id} from AMIssue {am_issue.id}" + ) + # Recarregar dados do AMIssue no Issue - load_issue_sections(user, existing_issue, am_issue, issue_data, collection=am_issue.collection) + load_issue_sections( + user, + existing_issue, + am_issue, + issue_data, + collection=am_issue.collection, + ) load_issue_titles(user, existing_issue, am_issue, issue_data) - load_bibliographic_strips(user, existing_issue, am_issue, issue_data) - + load_bibliographic_strips( + user, existing_issue, am_issue, issue_data + ) + stats["updated"] += 1 - + else: # Criar novo Issue logger.info(f"Creating new Issue from AMIssue {am_issue.id}") - + issue = create_issue_from_am_issue(user, am_issue) if issue: stats["created"] += 1 else: stats["errors"] += 1 - stats["error_details"].append({ - "am_issue_id": am_issue.id, - "pid": am_issue.pid, - "error": "Failed to create Issue from AMIssue" - }) - + stats["error_details"].append( + { + "am_issue_id": am_issue.id, + "pid": am_issue.pid, + "error": "Failed to create Issue from AMIssue", + } + ) + except Exception as e: stats["errors"] += 1 - stats["error_details"].append({ - "am_issue_id": am_issue.id, - "pid": getattr(am_issue, 'pid', None), - "error": str(e) - }) - + stats["error_details"].append( + { + "am_issue_id": am_issue.id, + "pid": getattr(am_issue, "pid", None), + "error": str(e), + } + ) + # Log individual errors but continue processing exc_type, exc_value, exc_traceback = sys.exc_info() UnexpectedEvent.create( @@ -545,20 +572,22 @@ def task_update_issues_from_amissue( action="issue.tasks.task_update_issues_from_amissue.process_am_issue", detail={ "am_issue_id": am_issue.id, - "pid": getattr(am_issue, 'pid', None), + "pid": getattr(am_issue, "pid", None), "collection_acron": collection_acron, "force_update": force_update, - } + }, ) continue - + # Log do resultado final - logger.info(f"Task completed. Processed: {stats['processed']}, " - f"Created: {stats['created']}, Updated: {stats['updated']}, " - f"Errors: {stats['errors']}") - + logger.info( + f"Task completed. Processed: {stats['processed']}, " + f"Created: {stats['created']}, Updated: {stats['updated']}, " + f"Errors: {stats['errors']}" + ) + return stats - + except Exception as e: exc_type, exc_value, exc_traceback = sys.exc_info() UnexpectedEvent.create( @@ -579,9 +608,9 @@ def task_update_issues_from_amissue( "only_without_new_record": only_without_new_record, "user_id": user_id, "username": username, - "task_id": self.request.id if hasattr(self.request, 'id') else None, + "task_id": self.request.id if hasattr(self.request, "id") else None, }, ) - + # Re-raise para que o Celery possa tratar a exceção adequadamente - raise \ No newline at end of file + raise From 44f6376c27e5dc0bb8f0855fd3479e6d355bae4d Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Thu, 11 Jun 2026 14:01:24 -0300 Subject: [PATCH 04/13] fix key in _iter_from_article --- article/controller.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/article/controller.py b/article/controller.py index 48ede7f3..430c71db 100644 --- a/article/controller.py +++ b/article/controller.py @@ -369,7 +369,7 @@ def bulk_export_articles_to_articlemeta( "article_id": article.id, "article_pid": getattr(article, "pid", None), "journal_acron": getattr(article, "journal_acron", None), - "pub_year": getattr(article, "pub_year", None), + "pub_date_year": getattr(article, "pub_date_year", None), "force_update": force_update, }, ) @@ -524,9 +524,9 @@ def _iter_from_article(self): if journal_id_list: filters["journal__in"] = journal_id_list if self.from_pub_year: - filters["pub_year__gte"] = self.from_pub_year + filters["pub_date_year__gte"] = self.from_pub_year if self.until_pub_year: - filters["pub_year__lte"] = self.until_pub_year + filters["pub_date_year__lte"] = self.until_pub_year if self.from_date: filters["updated__gte"] = self.from_date if self.until_date: @@ -554,9 +554,7 @@ def _iter_from_harvest(self): count = 0 for collection_acron in self.collection_acron_list or list(Collection.get_acronyms()): - logging.info(collection_acron) harvester = self._build_harvester(collection_acron) - logging.info(harvester) for document in harvester.harvest_documents(): count += 1 yield { @@ -565,7 +563,7 @@ def _iter_from_harvest(self): "pid": document["pid_v2"], "source_date": document.get("processing_date") or document.get("origin_date"), } - + self._iter_from_harvest_count = count logging.info(f"Harvest iterator yielded {count} documents") @@ -596,6 +594,7 @@ def _build_harvester(self, collection_acron): timeout=self.timeout, ) if collection_acron == "scl": - return OPACHarvester(self.opac_url or "www.scielo.br", collection_acron, **kwargs) + domain = self.opac_url or Collection.get(collection_acron).base_url + return OPACHarvester(domain, collection_acron, **kwargs) return AMHarvester("article", collection_acron, **kwargs) From 90ded23ffb39aecd8730d30cd363c925c7c07d0e Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Thu, 11 Jun 2026 14:01:54 -0300 Subject: [PATCH 05/13] remove comentario --- article/controller.py | 125 ++++++++++++------------------------------ 1 file changed, 36 insertions(+), 89 deletions(-) diff --git a/article/controller.py b/article/controller.py index 430c71db..fc81621b 100644 --- a/article/controller.py +++ b/article/controller.py @@ -6,16 +6,16 @@ from packtools.sps.formats.am import am -from article.models import Article, ArticleExporter, ArticleFunding, ArticleSource from article import choices +from article.models import Article, ArticleExporter, ArticleFunding, ArticleSource from collection.models import Collection from core.mongodb import write_item from core.utils.harvesters import AMHarvester, OPACHarvester from institution.models import Sponsor from journal.models import Journal from pid_provider.choices import ( - PPXML_STATUS_TODO, PPXML_STATUS_INVALID, + PPXML_STATUS_TODO, ) from pid_provider.models import PidProviderXML from tracker.models import UnexpectedEvent @@ -24,70 +24,6 @@ class ArticleIsNotAvailableError(Exception): ... -# def get_pp_xml_ids( -# collection_acron_list=None, -# journal_acron_list=None, -# from_pub_year=None, -# until_pub_year=None, -# from_updated_date=None, -# until_updated_date=None, -# proc_status_list=None, -# ): -# return select_pp_xml( -# collection_acron_list, -# journal_acron_list, -# from_pub_year, -# until_pub_year, -# from_updated_date, -# until_updated_date, -# proc_status_list=proc_status_list, -# ).values_list("id", flat=True) - - -# def select_pp_xml( -# collection_acron_list=None, -# journal_acron_list=None, -# from_pub_year=None, -# until_pub_year=None, -# from_updated_date=None, -# until_updated_date=None, -# proc_status_list=None, -# params=None, -# ): -# params = params or {} - -# q = Q() -# if journal_acron_list or collection_acron_list: -# issns = Journal.get_issn_list(collection_acron_list, journal_acron_list) -# issn_print_list = issns["issn_print_list"] -# issn_electronic_list = issns["issn_electronic_list"] - -# if issn_print_list or issn_electronic_list: -# q = Q(issn_print__in=issn_print_list) | Q( -# issn_electronic__in=issn_electronic_list -# ) -# elif issn_print_list: -# q = Q(issn_print__in=issn_print_list) -# elif issn_electronic_list: -# q = Q(issn_electronic__in=issn_electronic_list) - -# if from_updated_date: -# params["updated__gte"] = from_updated_date -# if until_updated_date: -# params["updated__lte"] = until_updated_date - -# if from_pub_year: -# params["pub_year__gte"] = from_pub_year -# if until_pub_year: -# params["pub_year__lte"] = until_pub_year - -# if proc_status_list: -# params["proc_status__in"] = proc_status_list - -# logging.info(params) -# return PidProviderXML.objects.filter(q, **params) - - def load_financial_data(row, user): article_findings = [] for institution in row.get("funding_source").split(","): @@ -143,9 +79,9 @@ def export_article_to_articlemeta( logging.info( f"export_article_to_articlemeta: {article}, collections: {collection_acron_list}, force_update: {force_update}" ) - legacy_keys_items = list(article.get_legacy_keys( - collection_acron_list, is_active=True - )) + legacy_keys_items = list( + article.get_legacy_keys(collection_acron_list, is_active=True) + ) logging.info(f"Legacy keys to process: {legacy_keys_items}") if not legacy_keys_items: UnexpectedEvent.create( @@ -172,7 +108,7 @@ def export_article_to_articlemeta( external_data["pid_v3"] = article.pid_v3 text_langs = article.get_text_langs() - + article_data = {} for legacy_keys in legacy_keys_items: try: @@ -326,7 +262,7 @@ def bulk_export_articles_to_articlemeta( until_pub_year=until_pub_year, from_updated_date=from_date, until_updated_date=until_date, - params=params + params=params, ) if not queryset.exists(): UnexpectedEvent.create( @@ -345,7 +281,9 @@ def bulk_export_articles_to_articlemeta( ) return False - for article in queryset.select_related("journal", "journal__official", "pp_xml").iterator(): + for article in queryset.select_related( + "journal", "journal__official", "pp_xml" + ).iterator(): try: if force_update: article.check_availability(user) @@ -374,9 +312,9 @@ def bulk_export_articles_to_articlemeta( }, ) continue - + return True - + except Exception as e: exc_type, exc_value, exc_traceback = sys.exc_info() UnexpectedEvent.create( @@ -473,10 +411,12 @@ def __iter__(self): yield from self._iter_from_pid_provider() yield from self._iter_from_article() - logging.info(f"Iterators summary: harvest={self._iter_from_harvest_count}, " - f"article_source={self._iter_from_article_source_count}, " - f"pid_provider={self._iter_from_pid_provider_count}, " - f"article={self._iter_from_article_count}") + logging.info( + f"Iterators summary: harvest={self._iter_from_harvest_count}, " + f"article_source={self._iter_from_article_source_count}, " + f"pid_provider={self._iter_from_pid_provider_count}, " + f"article={self._iter_from_article_count}" + ) # ------------------------------------------------------------------ # Iteradores de seleção @@ -484,10 +424,9 @@ def __iter__(self): def _iter_from_pid_provider(self): """Itera PidProviderXML filtrados por periódico, data e status.""" - journal_issn_groups = ( - Journal.get_journal_issns(self.collection_acron_list, self.journal_acron_list) - or [None] - ) + journal_issn_groups = Journal.get_journal_issns( + self.collection_acron_list, self.journal_acron_list + ) or [None] for journal_issns in journal_issn_groups: issn_list = [i for i in journal_issns if i] if journal_issns else None if journal_issns and not issn_list: @@ -498,12 +437,15 @@ def _iter_from_pid_provider(self): until_pub_year=self.until_pub_year, from_updated_date=self.from_date, until_updated_date=self.until_date, - proc_status_list=self.proc_status_list or [PPXML_STATUS_TODO, PPXML_STATUS_INVALID], + proc_status_list=self.proc_status_list + or [PPXML_STATUS_TODO, PPXML_STATUS_INVALID], ) self._iter_from_pid_provider_count += qs.count() for item in qs.iterator(): yield {"pp_xml_id": item.id} - logging.info(f"_iter_from_pid_provider: yielded {self._iter_from_pid_provider_count} items") + logging.info( + f"_iter_from_pid_provider: yielded {self._iter_from_pid_provider_count} items" + ) def _iter_from_article(self): """ @@ -511,7 +453,8 @@ def _iter_from_article(self): Yields None para artigos sem pp_xml recuperável (sinaliza skip). """ filters = { - "data_status__in": self.data_status_list or [ + "data_status__in": self.data_status_list + or [ choices.DATA_STATUS_PENDING, choices.DATA_STATUS_UNDEF, choices.DATA_STATUS_INVALID, @@ -544,7 +487,9 @@ def _iter_from_article(self): yield None continue yield {"pp_xml_id": article.pp_xml.id} - logging.info(f"_iter_from_article: yielded {self._iter_from_article_count} articles") + logging.info( + f"_iter_from_article: yielded {self._iter_from_article_count} articles" + ) def _iter_from_harvest(self): """Itera documentos coletados via OPAC ou ArticleMeta.""" @@ -553,7 +498,9 @@ def _iter_from_harvest(self): Collection.load(self.user) count = 0 - for collection_acron in self.collection_acron_list or list(Collection.get_acronyms()): + for collection_acron in self.collection_acron_list or list( + Collection.get_acronyms() + ): harvester = self._build_harvester(collection_acron) for document in harvester.harvest_documents(): count += 1 @@ -561,7 +508,8 @@ def _iter_from_harvest(self): "xml_url": document["url"], "collection_acron": collection_acron, "pid": document["pid_v2"], - "source_date": document.get("processing_date") or document.get("origin_date"), + "source_date": document.get("processing_date") + or document.get("origin_date"), } self._iter_from_harvest_count = count @@ -597,4 +545,3 @@ def _build_harvester(self, collection_acron): domain = self.opac_url or Collection.get(collection_acron).base_url return OPACHarvester(domain, collection_acron, **kwargs) return AMHarvester("article", collection_acron, **kwargs) - From 76ff7f93290ca0f6caa1f4905c3bf4916fa87a18 Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Thu, 11 Jun 2026 14:02:14 -0300 Subject: [PATCH 06/13] Add save in Articlesource.create --- article/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/article/models.py b/article/models.py index 2469175e..a330e790 100755 --- a/article/models.py +++ b/article/models.py @@ -1832,6 +1832,7 @@ def create( obj.source_date = source_date obj.am_article = am_article obj.status = cls.StatusChoices.PENDING + obj.save() obj.add_pid_provider( user, force_update, auto_solve_pid_conflict=auto_solve_pid_conflict ) From bc18ed4f7cf484731da8b7825bbc4ed758dc958b Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Thu, 11 Jun 2026 14:02:58 -0300 Subject: [PATCH 07/13] Add protocolo em url se nao houver em task_process_article_pipeline --- article/tasks.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/article/tasks.py b/article/tasks.py index 4905ca04..9e4c4bcf 100644 --- a/article/tasks.py +++ b/article/tasks.py @@ -854,7 +854,6 @@ def task_process_article_pipeline( collection_acron_list=None, force_update=None, auto_solve_pid_conflict=None, - version=None, user_id=None, username=None, ): @@ -863,7 +862,7 @@ def task_process_article_pipeline( Implementa um pipeline flexível que pode iniciar em diferentes estágios: - Fluxo A: XML URL → ArticleSource → PidProviderXML → Article - - Fluxo B: ArticleSource existente → PidProviderXML → Article + - Fluxo B: ArticleSource existente → PidProviderXML → Article - Fluxo C: PidProviderXML → Article (entrada direta) Args: @@ -920,12 +919,17 @@ def task_process_article_pipeline( """ try: user = _get_user(self.request, username=username, user_id=user_id) - + article_source = None + pp_xml = None + if xml_url: if not collection_acron: raise ValueError("collection_acron is required when xml_url is provided") if not pid: raise ValueError("pid is required when xml_url is provided") + if not xml_url.startswith(("http://", "https://")): + xml_url = f"https://{xml_url}" + am_article = AMArticle.create_or_update( pid, Collection.get(collection_acron), None, user ) @@ -942,8 +946,7 @@ def task_process_article_pipeline( am_article=am_article, auto_solve_pid_conflict=auto_solve_pid_conflict, ) - pp_xml_id = article_source.pid_provider_xml.id - + if article_source_id: article_source = ArticleSource.objects.get(id=article_source_id) article_source.add_pid_provider( @@ -951,23 +954,19 @@ def task_process_article_pipeline( force_update=force_update, auto_solve_pid_conflict=auto_solve_pid_conflict, ) - pp_xml_id = article_source.pid_provider_xml.id - if not pp_xml_id: + pp_xml = article_source.pid_provider_xml + if not pp_xml: raise ValueError( "No valid entry point provided. Please provide either xml_url, " "article_source_id, pp_xml_id or pid_v3." ) - pp_xml = PidProviderXML.objects.select_related( - "current_version" - ).get(id=pp_xml_id) - article = load_article(user, pp_xml=pp_xml) pp_xml.collections.set(article.collections) article.check_availability(user, force_update=export_to_articlemeta or force_update) - + if export_to_articlemeta: if not article.is_classic_public or not article.valid: logging.warning(f"Article {article.pid_v3} is not valid or not public. Skipping export to ArticleMeta.") From 1d0c6f7101438a7b2688504c152018ab0d943f7e Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Thu, 11 Jun 2026 14:03:28 -0300 Subject: [PATCH 08/13] Black --- article/tasks.py | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/article/tasks.py b/article/tasks.py index 9e4c4bcf..deca5d24 100644 --- a/article/tasks.py +++ b/article/tasks.py @@ -5,7 +5,7 @@ from django.utils.translation import gettext_lazy as _ from article import controller -from article.models import Article, ArticleFormat, ArticleSource, AMArticle +from article.models import AMArticle, Article, ArticleFormat, ArticleSource from article.sources.preprint import harvest_preprints from article.sources.xmlsps import load_article from collection.models import Collection @@ -14,7 +14,6 @@ from core.utils.extracts_normalized_email import extracts_normalized_email from core.utils.utils import _get_user from journal.models import Journal -from pid_provider.models import PidProviderXML from researcher.models import ResearcherIdentifier from tracker.models import UnexpectedEvent @@ -50,7 +49,7 @@ def load_funding_data(user, file_path): controller.read_file(user, file_path) -@celery_app.task(bind=True, name=_('load_preprints')) +@celery_app.task(bind=True, name=_("load_preprints")) def load_preprint(self, user_id, oai_pmh_preprint_uri): """ Coleta e carrega preprints de um endpoint OAI-PMH específico. @@ -249,17 +248,19 @@ def transfer_license_statements_fk_to_article_license( instance.license = first.license if not instance.license and first.data: data = first.data - instance.license = License.create_or_update(user, license_type=data.get("license_type"), version=data.get("license_version")) - + instance.license = License.create_or_update( + user, + license_type=data.get("license_type"), + version=data.get("license_version"), + ) + if not instance.license: continue instance.updated_by = user articles_to_update.append(instance) if articles_to_update: - Article.objects.bulk_update( - articles_to_update, ["license", "updated_by"] - ) + Article.objects.bulk_update(articles_to_update, ["license", "updated_by"]) logging.info("The license of model Articles have been updated") @@ -267,7 +268,7 @@ def get_researcher_identifier_unnormalized(): """ Retorna identificadores de e-mail que não seguem formato padrão RFC 5322. - Filtra objetos ResearcherIdentifier que possuem source_name="EMAIL" + Filtra objetos ResearcherIdentifier que possuem source_name="EMAIL" mas cujo campo identifier não corresponde ao padrão de e-mail válido. Returns: @@ -407,12 +408,12 @@ def task_export_articles_to_articlemeta( days_to_go_back=days_to_go_back, force_update=force_update, ) - + return result - + except Exception as e: exc_type, exc_value, exc_traceback = sys.exc_info() - + UnexpectedEvent.create( exception=e, exc_traceback=exc_traceback, @@ -429,10 +430,10 @@ def task_export_articles_to_articlemeta( "force_update": force_update, "user_id": user_id, "username": username, - "task_id": self.request.id if hasattr(self.request, 'id') else None, + "task_id": self.request.id if hasattr(self.request, "id") else None, }, ) - + # Re-raise para que o Celery possa tratar a exceção adequadamente raise @@ -502,7 +503,7 @@ def task_export_article_to_articlemeta( collection_acron_list=collection_acron_list, force_update=force_update, ) - except Article.DoesNotExist as exception: + except Article.DoesNotExist: return False except Exception as exception: exc_type, exc_value, exc_traceback = sys.exc_info() @@ -837,6 +838,7 @@ def task_dispatch_articles( ) raise + @celery_app.task(bind=True) def task_process_article_pipeline( self, @@ -900,7 +902,7 @@ def task_process_article_pipeline( # Fluxo completo a partir de URL task_process_article_pipeline.delay( xml_url="http://example.com/article.xml", - collection_acron="scl", + collection_acron="scl", pid="S1234-56782024000100001", export_to_articlemeta=True ) @@ -924,7 +926,9 @@ def task_process_article_pipeline( if xml_url: if not collection_acron: - raise ValueError("collection_acron is required when xml_url is provided") + raise ValueError( + "collection_acron is required when xml_url is provided" + ) if not pid: raise ValueError("pid is required when xml_url is provided") if not xml_url.startswith(("http://", "https://")): @@ -965,11 +969,15 @@ def task_process_article_pipeline( article = load_article(user, pp_xml=pp_xml) pp_xml.collections.set(article.collections) - article.check_availability(user, force_update=export_to_articlemeta or force_update) + article.check_availability( + user, force_update=export_to_articlemeta or force_update + ) if export_to_articlemeta: if not article.is_classic_public or not article.valid: - logging.warning(f"Article {article.pid_v3} is not valid or not public. Skipping export to ArticleMeta.") + logging.warning( + f"Article {article.pid_v3} is not valid or not public. Skipping export to ArticleMeta." + ) return task_export_article_to_articlemeta.delay( pid_v3=article.pid_v3, From 2fd657e01987f7a20d79e49f97fee22ddf59a6cd Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Thu, 11 Jun 2026 14:17:01 -0300 Subject: [PATCH 09/13] remove code inutilizavel --- article/scripts/load_articles.py | 5 ----- article/sources/xmlsps.py | 14 -------------- bigbang/tasks_scheduler.py | 2 -- core/models.py | 4 ---- 4 files changed, 25 deletions(-) delete mode 100644 article/scripts/load_articles.py diff --git a/article/scripts/load_articles.py b/article/scripts/load_articles.py deleted file mode 100644 index 038ff6c6..00000000 --- a/article/scripts/load_articles.py +++ /dev/null @@ -1,5 +0,0 @@ -from article.tasks import load_articles - - -def run(username=None): - load_articles.apply_async(kwargs={"username": username}) diff --git a/article/sources/xmlsps.py b/article/sources/xmlsps.py index db21ace8..fa6f3f72 100755 --- a/article/sources/xmlsps.py +++ b/article/sources/xmlsps.py @@ -107,15 +107,6 @@ def load_article(user, pp_xml): try: xml_with_pre = pp_xml.xml_with_pre except Exception as e: - updated = ( - Article.objects.filter(pp_xml=pp_xml) - .exclude( - data_status=choices.DATA_STATUS_INVALID, - ) - .update( - data_status=choices.DATA_STATUS_INVALID, - ) - ) errors = [ { "function": "load_article", @@ -679,11 +670,6 @@ def create_or_update_contrib_persons(xmltree, article, user, item, errors): Returns: list: Lista de objetos ContribPerson criados """ - article_lang = None - try: - article_lang = ArticleAndSubArticles(xmltree=xmltree).main_lang - except Exception as e: - add_error(errors, "create_or_update_contrib_persons.get_main_lang", e) data = [] try: diff --git a/bigbang/tasks_scheduler.py b/bigbang/tasks_scheduler.py index 48aa115d..6c280120 100644 --- a/bigbang/tasks_scheduler.py +++ b/bigbang/tasks_scheduler.py @@ -25,7 +25,6 @@ def delete_outdated_tasks(task_list=None): task_list = task_list or [ # Tarefas de Article com namespace completo "article.tasks.load_article_from_pp_xml", - "article.tasks.load_articles", "article.tasks.task_mark_articles_as_deleted_without_pp_xml", "article.tasks.task_remove_duplicate_articles", "article.tasks.task_convert_xml_to_other_formats_for_articles", @@ -35,7 +34,6 @@ def delete_outdated_tasks(task_list=None): "article.tasks.task_select_articles_to_load_from_api", "article.tasks.task_select_articles_to_load_from_collection_endpoint", "article.tasks.task_select_articles_to_load_from_article_source", - "article.tasks.task_load_articles", "article.tasks.task_load_journal_articles", "article.tasks.task_load_article_from_xml_url", "article.tasks.task_create_article_source", diff --git a/core/models.py b/core/models.py index 2f7dc6ac..95cc8ab3 100755 --- a/core/models.py +++ b/core/models.py @@ -1284,10 +1284,6 @@ def __str__(self): def get(cls, pid, collection): if not pid and not collection: raise ValueError("Param pid and collection_acron3 is required") - try: - cls.objects.filter(url__isnull=True, data__isnull=True).delete() - except Exception: - pass try: return cls.objects.get(pid=pid, collection=collection) except cls.MultipleObjectsReturned: From 558c19c1dfa275ff4c0b086ef9ba421b04d9a088 Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Thu, 11 Jun 2026 14:17:28 -0300 Subject: [PATCH 10/13] padroniza url --- core/utils/harvesters.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/core/utils/harvesters.py b/core/utils/harvesters.py index 928b7233..297f4e61 100644 --- a/core/utils/harvesters.py +++ b/core/utils/harvesters.py @@ -3,6 +3,7 @@ from typing import Any, Dict, Generator, Optional from urllib.parse import urlencode +from article.utils.url_builder import ArticleURLBuilder from core.utils.utils import fetch_data @@ -162,7 +163,9 @@ def __init__( limit: Número de documentos por página timeout: Timeout em segundos para requisições """ - self.domain = domain + if not domain.startswith(("http://", "https://")): + domain = f"https://{domain}" + self.domain = domain.rstrip("/") self.collection_acron = collection_acron self.from_date = from_date or "2000-01-01" self.until_date = until_date or datetime.utcnow().isoformat()[:10] @@ -222,9 +225,10 @@ def harvest_documents(self) -> Generator[Dict[str, Any], None, None]: logging.warning(f"Invalid document data: {item}") continue - # Constrói URL do XML journal_acron = item["journal_acronym"] - xml_url = f"{self.domain}/j/{journal_acron}/a/{pid_v3}/?format=xml" + xml_url = ArticleURLBuilder( + self.domain, journal_acron, pid_v3=pid_v3 + ).get_xml_url() # Extrai data de origem origin_date = self._parse_gmt_date( From d021cc9fbc7c352dbe1cb902c4fd39ee5f7d244d Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Thu, 11 Jun 2026 14:17:33 -0300 Subject: [PATCH 11/13] black --- journal/api/v1/views.py | 68 ++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/journal/api/v1/views.py b/journal/api/v1/views.py index 84177a24..9202d7b5 100644 --- a/journal/api/v1/views.py +++ b/journal/api/v1/views.py @@ -1,5 +1,5 @@ -from rest_framework import serializers, viewsets from django.db.models import F, Prefetch, Q +from rest_framework import serializers, viewsets from core.utils.utils import formated_date_api_params from core.validators import validate_params @@ -13,19 +13,19 @@ class Meta: model = models.Journal def to_representation(self, instance): - collection = getattr(instance, 'collection_acron', None) + collection = getattr(instance, "collection_acron", None) return instance.articlemeta_format(collection) - + class GenericJournalViewSet(viewsets.ModelViewSet): serializer_class = JournalSerializer http_method_names = ["get"] queryset = models.Journal.objects.prefetch_related( Prefetch( "crossmark_policy", - queryset=models.CrossmarkPolicy.objects - .select_related("language", "journal__official") - .prefetch_related( + queryset=models.CrossmarkPolicy.objects.select_related( + "language", "journal__official" + ).prefetch_related( Prefetch( "journal__scielojournal_set", queryset=models.SciELOJournal.objects.select_related("collection"), @@ -38,7 +38,7 @@ class GenericJournalViewSet(viewsets.ModelViewSet): class JournalViewSet(GenericJournalViewSet): def get_queryset(self): query_params = self.request.query_params - + validate_params( self.request, "issn_print", @@ -60,35 +60,37 @@ def get_queryset(self): params = {} if issn := query_params.get("issn"): - params["scielojournal__issn_scielo"] = issn + params["scielojournal__issn_scielo"] = issn if issn_electronic := query_params.get("issn_electronic"): - params["official__issn_electronic"] = issn_electronic + params["official__issn_electronic"] = issn_electronic if issn_print := query_params.get("issn_print"): - params["official__issn_print"] = issn_print + params["official__issn_print"] = issn_print if issnl := query_params.get("issnl"): - params["official__issnl"] = issnl + params["official__issnl"] = issnl if title := query_params.get("title"): - params["title"] = title + params["title"] = title if toc_item := query_params.get("toc_item"): - params["journaltocsection__toc_items__text"] = toc_item + params["journaltocsection__toc_items__text"] = toc_item if thematic_areas := query_params.get("thematic_areas"): - params["subject__value__in"] = thematic_areas.split(",") + params["subject__value__in"] = thematic_areas.split(",") if collection_acron := query_params.get("collection"): - params["scielojournal__collection__acron3"] = collection_acron - + params["scielojournal__collection__acron3"] = collection_acron + formated_date = formated_date_api_params(query_params) if formated_date: params.update(formated_date) query = super().get_queryset() if query_params.get("formats") == "articlemeta": - return query.filter( - scielojournal__journal__isnull=False - ).filter(**params).annotate( - collection_acron=F('scielojournal__collection__acron3') - ).order_by('created').distinct() - - return query.filter(**params).order_by('created').distinct() + return ( + query.filter(scielojournal__journal__isnull=False) + .filter(**params) + .annotate(collection_acron=F("scielojournal__collection__acron3")) + .order_by("created") + .distinct() + ) + + return query.filter(**params).order_by("created").distinct() def get_serializer_class(self): format_param = self.request.query_params.get("formats") @@ -96,6 +98,7 @@ def get_serializer_class(self): return ArticleMetaFormatSerializer return JournalSerializer + class CrossmarkPolicyViewSet(viewsets.ModelViewSet): """ API endpoint that exposes CrossmarkPolicy data for journals. @@ -163,14 +166,12 @@ class CrossmarkPolicyViewSet(viewsets.ModelViewSet): serializer_class = CrossmarkPolicySerializer http_method_names = ["get"] - queryset = ( - models.CrossmarkPolicy.objects - .select_related("language", "journal__official") - .prefetch_related( - Prefetch( - "journal__scielojournal_set", - queryset=models.SciELOJournal.objects.select_related("collection"), - ) + queryset = models.CrossmarkPolicy.objects.select_related( + "language", "journal__official" + ).prefetch_related( + Prefetch( + "journal__scielojournal_set", + queryset=models.SciELOJournal.objects.select_related("collection"), ) ) @@ -188,9 +189,8 @@ def get_queryset(self): params = {} issn_filter = None if issn := query_params.get("issn"): - issn_filter = ( - Q(journal__official__issn_electronic=issn) - | Q(journal__official__issn_print=issn) + issn_filter = Q(journal__official__issn_electronic=issn) | Q( + journal__official__issn_print=issn ) if collection := query_params.get("collection"): params["journal__scielojournal__collection__acron3"] = collection From 56d00b5e0b3f1ca10dc26ffba73a1be829225c88 Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Thu, 11 Jun 2026 14:21:39 -0300 Subject: [PATCH 12/13] black --- core/models.py | 84 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/core/models.py b/core/models.py index 95cc8ab3..66f93829 100755 --- a/core/models.py +++ b/core/models.py @@ -1,7 +1,5 @@ -import csv import json import os -import logging from datetime import datetime from django.contrib.auth import get_user_model @@ -12,8 +10,6 @@ from wagtail.admin.panels import FieldPanel, ObjectList, TabbedInterface from wagtail.fields import RichTextField from wagtail.models import ClusterableModel -from wagtail.search import index -from wagtail.snippets.models import register_snippet from wagtailautocomplete.edit_handlers import AutocompletePanel from core import choices @@ -282,6 +278,7 @@ class RawOrganizationMixin(models.Model): Mixin for storing raw, unstructured organization data. Intended to replace references to institution.models.Institution. """ + raw_text = models.TextField( _("Raw Text"), null=True, @@ -401,7 +398,9 @@ def get_or_create( ) @classmethod - def create(cls, raw_text, initial_date=None, final_date=None, user=None, institution=None): + def create( + cls, raw_text, initial_date=None, final_date=None, user=None, institution=None + ): history = cls() history.raw_text = raw_text history.creator = user @@ -422,7 +421,7 @@ def institution_name(self): return self.raw_institution_name if self.raw_text: return self.raw_text - if hasattr(self, 'institution') and self.institution: + if hasattr(self, "institution") and self.institution: # Fallback para o campo institution (deprecated) try: return self.institution.institution_name @@ -451,8 +450,11 @@ class OrganizationNameMixin(models.Model): name: The organization's full name acronym: The organization's acronym """ + name = models.CharField(_("Name"), max_length=255) - acronym = models.CharField(_("Institution Acronym"), max_length=20, null=True, blank=True) + acronym = models.CharField( + _("Institution Acronym"), max_length=20, null=True, blank=True + ) autocomplete_search_field = "name" @@ -488,6 +490,7 @@ class VisualIdentityMixin(models.Model): logo: The organization's logo image url: The organization's website URL """ + url = models.URLField(_("URL"), blank=True, null=True) logo = models.ImageField(_("Logo"), blank=True, null=True) @@ -736,7 +739,9 @@ def create(cls, user, license, language=None, url=None, license_p=None): @classmethod def create_or_update(cls, user, license, language=None, url=None, license_p=None): if not license: - raise ValueError("LicenseStatement.create_or_update requires license parameter") + raise ValueError( + "LicenseStatement.create_or_update requires license parameter" + ) try: obj = cls.get(license=license, language=language) obj.updated_by = user @@ -773,7 +778,7 @@ def parse_url(url): continue license_type = part - remaining = url_parts[i + 1:] + remaining = url_parts[i + 1 :] license_version = None license_language = None @@ -872,7 +877,9 @@ def final_date_isoformat(self): class BaseDateRange(models.Model): # Used to replace BaseHistory, which will be DEPRECATED # Uso de datas em formato YYYY-MM-DD, YYYY-MM ou YYYY adotado por SciELO - initial_date = models.CharField(_("Initial Date"), max_length=10, null=True, blank=True) + initial_date = models.CharField( + _("Initial Date"), max_length=10, null=True, blank=True + ) final_date = models.CharField(_("Final Date"), max_length=10, null=True, blank=True) panels = [ @@ -1152,7 +1159,7 @@ def finish( for k, v in self.detail.items(): try: json.dumps(v) - except Exception as e: + except Exception: self.detail[k] = str(v) self.updated = datetime.utcnow() self.updated_by = user @@ -1219,6 +1226,7 @@ class BaseLegacyRecord(CommonControlField): from: https://articlemeta.scielo.org/api/v1/journal/?collection={collection}&issn={issn}" """ + STATUS_CHOICES = [ ("pending", _("Pending")), ("todo", _("To Do")), @@ -1246,7 +1254,7 @@ class BaseLegacyRecord(CommonControlField): max_length=10, null=True, blank=True, - help_text=_("Date in YYYY-MM-DD format") + help_text=_("Date in YYYY-MM-DD format"), ) status = models.CharField( _("Status"), @@ -1287,12 +1295,28 @@ def get(cls, pid, collection): try: return cls.objects.get(pid=pid, collection=collection) except cls.MultipleObjectsReturned: - return cls.objects.filter(pid=pid, collection=collection).order_by("-updated").first() + return ( + cls.objects.filter(pid=pid, collection=collection) + .order_by("-updated") + .first() + ) @classmethod - def create(cls, pid, collection, data=None, user=None, url=None, processing_date=None, status=None, new_record=None): + def create( + cls, + pid, + collection, + data=None, + user=None, + url=None, + processing_date=None, + status=None, + new_record=None, + ): if not pid or not collection or not user: - raise ValueError(f"{cls.__name__} create requires pid {pid}, collection {collection}, user {user}") + raise ValueError( + f"{cls.__name__} create requires pid {pid}, collection {collection}, user {user}" + ) obj = cls() obj.pid = pid obj.collection = collection @@ -1311,14 +1335,38 @@ def create(cls, pid, collection, data=None, user=None, url=None, processing_date return obj @classmethod - def create_or_update(cls, pid, collection, data=None, user=None, url=None, status=None, processing_date=None, force_update=None, new_record=None): + def create_or_update( + cls, + pid, + collection, + data=None, + user=None, + url=None, + status=None, + processing_date=None, + force_update=None, + new_record=None, + ): try: obj = cls.get(pid=pid, collection=collection) obj.updated_by = user except cls.DoesNotExist: - return cls.create(pid, collection, data, user, url=url, processing_date=processing_date, status=status, new_record=new_record) + return cls.create( + pid, + collection, + data, + user, + url=url, + processing_date=processing_date, + status=status, + new_record=new_record, + ) except cls.MultipleObjectsReturned: - obj = cls.objects.filter(pid=pid, collection=collection).order_by("-updated").first() + obj = ( + cls.objects.filter(pid=pid, collection=collection) + .order_by("-updated") + .first() + ) obj.updated_by = user if processing_date and processing_date == obj.processing_date: From 485bbf569a36fd6d89635a7532522917a02a44c1 Mon Sep 17 00:00:00 2001 From: Samuel Veiga Rangel Date: Thu, 11 Jun 2026 14:22:19 -0300 Subject: [PATCH 13/13] black --- pid_provider/models.py | 151 ++++++++++++++++++---------------- pid_provider/tasks.py | 2 - pid_provider/wagtail_hooks.py | 10 ++- 3 files changed, 87 insertions(+), 76 deletions(-) diff --git a/pid_provider/models.py b/pid_provider/models.py index cb67abda..1d131b36 100644 --- a/pid_provider/models.py +++ b/pid_provider/models.py @@ -1,25 +1,21 @@ import io -import json import logging import os import sys -import traceback import zipfile from datetime import datetime -from functools import lru_cache, cached_property +from functools import cached_property from zlib import crc32 from django.core.files.base import ContentFile from django.db import IntegrityError, models -from django.db.models import Q, Count, Min +from django.db.models import Count, Q from django.utils.translation import gettext_lazy as _ from modelcluster.fields import ParentalKey from modelcluster.models import ClusterableModel from packtools.sps.pid_provider import v3_gen, xml_sps_adapter from packtools.sps.pid_provider.xml_sps_lib import XMLWithPre from wagtail.admin.panels import FieldPanel, InlinePanel, ObjectList, TabbedInterface -from wagtail.fields import RichTextField -from wagtail.models import Orderable from wagtailautocomplete.edit_handlers import AutocompletePanel from collection.models import Collection @@ -34,9 +30,9 @@ from core.utils.similarity import how_similar from pid_provider import choices, exceptions from pid_provider.query_params import ( + QueryBuilderPidProviderXML, get_score, zero_to_none, - QueryBuilderPidProviderXML, ) from tracker.models import BaseEvent, UnexpectedEvent @@ -110,7 +106,9 @@ class XMLVersion(CommonControlField): pid_provider_xml = models.ForeignKey( "PidProviderXML", null=True, blank=True, on_delete=models.SET_NULL ) - file = models.FileField(upload_to=xml_directory_path, null=True, blank=True, max_length=300) + file = models.FileField( + upload_to=xml_directory_path, null=True, blank=True, max_length=300 + ) finger_print = models.CharField(max_length=64, null=True, blank=True) class Meta: @@ -179,7 +177,7 @@ def xml(self): return self.xml_with_pre.tostring(pretty_print=True) except XMLVersionXmlWithPreError as e: return str(e) - except FileNotFoundError as e: + except FileNotFoundError: return None @classmethod @@ -206,7 +204,7 @@ def get_or_create(cls, user, pid_provider_xml, xml_with_pre): latest = cls.get(pid_provider_xml, xml_with_pre.finger_print) try: file_exist = os.path.isfile(latest.file.path) - except (AttributeError, TypeError, ValueError) as e: + except (AttributeError, TypeError, ValueError): file_exist = False if file_exist: return latest @@ -527,14 +525,14 @@ class Meta: def __str__(self): return f"{self.pkg_name} {self.v3}" - + @property def article_pid_suffix_source(self): try: return self.xml_with_pre.get_article_pid_suffix_source() except AttributeError: return self.elocation_id or self.fpage or self.xml_with_pre.order - + def get_article_pid_suffix(self): data = self.article_pid_suffix_source if not data: @@ -577,11 +575,15 @@ def get_queryset( @profile_classmethod def public_items(cls, from_date): now = datetime.utcnow().isoformat()[:10] - return cls.objects.select_related("current_version").filter( - (Q(available_since__isnull=True) | Q(available_since__lte=now)) - & (Q(created__gte=from_date) | Q(updated__gte=from_date)), - current_version__pid_provider_xml__v3__isnull=False, - ).iterator() + return ( + cls.objects.select_related("current_version") + .filter( + (Q(available_since__isnull=True) | Q(available_since__lte=now)) + & (Q(created__gte=from_date) | Q(updated__gte=from_date)), + current_version__pid_provider_xml__v3__isnull=False, + ) + .iterator() + ) @property def created_updated(self): @@ -695,9 +697,6 @@ def register( Parâmetros insuficientes para identificar documento """ try: - input_data = None - xml_adapter_data = None - response = {} response["input_data"] = xml_with_pre.data response["input_data"].update({"origin": origin}) @@ -710,9 +709,12 @@ def register( try: records = cls.get_records(xml_adapter) registered = cls.get_record(xml_adapter, records=records) - except cls.DoesNotExist as exc: + except cls.DoesNotExist: registered = None - except (cls.MultipleObjectsReturned, exceptions.UnmatchedPidProviderXMLError) as exc: + except ( + cls.MultipleObjectsReturned, + exceptions.UnmatchedPidProviderXMLError, + ) as exc: response["records"] = [item.data for item in records] raise exceptions.QueryDocumentMultipleObjectsReturnedError(exc) except ( @@ -972,8 +974,8 @@ def get_record_by_pid_v3(cls, xml_adapter): if not xml_adapter.v3: raise ValueError("get_record_by_pid_v3: XML has not pid v3") xml_pid_v3 = xml_adapter.v3 - results = ( - cls.objects.filter(Q(v3=xml_pid_v3) | Q(other_pid__pid_in_xml=xml_pid_v3)) + results = cls.objects.filter( + Q(v3=xml_pid_v3) | Q(other_pid__pid_in_xml=xml_pid_v3) ) if not results.exists(): raise cls.DoesNotExist @@ -983,7 +985,10 @@ def get_record_by_pid_v3(cls, xml_adapter): item=xml_adapter.sps_pkg_name, action="PidProviderXML.get_record_by_pid_v3", exception=PidProviderXMLPidV3ConflictError, - detail={"xml_adapter": xml_adapter.data, "results": [i.data for i in results]}, + detail={ + "xml_adapter": xml_adapter.data, + "results": [i.data for i in results], + }, ) raise PidProviderXMLPidV3ConflictError( _("No matching record found for the provided XML data.") @@ -992,8 +997,7 @@ def get_record_by_pid_v3(cls, xml_adapter): @profile_method def match(self, xml_adapter): - """ - """ + """ """ labels = [] score = self.title_similarity(xml_adapter) * 100 if score > 50: @@ -1007,7 +1011,9 @@ def match(self, xml_adapter): if score_item := get_score(self.z_links, xml_adapter.z_links, 10, 100): labels.append("z_links") score += score_item - if score_item := get_score(self.z_partial_body, xml_adapter.z_partial_body, 10, 100): + if score_item := get_score( + self.z_partial_body, xml_adapter.z_partial_body, 10, 100 + ): labels.append("z_partial_body") score += score_item return {"score": score, "labels": labels} @@ -1064,7 +1070,7 @@ def best_matches(cls, results, xml_adapter): "xml_adapter_data": xml_adapter.data, "data": data, "matched": matched, - } + } UnexpectedEvent.create( item=xml_adapter.sps_pkg_name, action="PidProviderXML.best_matches", @@ -1102,7 +1108,7 @@ def _add_dates(self, xml_adapter, origin_date, available_since): self.available_since = available_since or ( xml_adapter.xml_with_pre.article_publication_date ) - except Exception as e: + except Exception: # packtools error self.available_since = origin_date self.origin_date = origin_date @@ -1124,7 +1130,7 @@ def _add_current_version(self, xml_with_pre, user, delete=False): if delete: try: self.current_version.delete() - except Exception as e: + except Exception: pass self.current_version = XMLVersion.get_or_create(user, self, xml_with_pre) @@ -1169,7 +1175,6 @@ def _add_other_pid(self, registered_changed, user): if not registered_changed: return for change_args in registered_changed: - change_args["pid_in_xml"] = change_args.pop("registered") change_args["user"] = user @@ -1240,12 +1245,15 @@ def is_registered( try: records = cls.get_records(xml_adapter) registered = cls.get_record(xml_adapter, records=records) - except cls.DoesNotExist as exc: + except cls.DoesNotExist: response.update( {"filename": xml_with_pre.filename, "registered": False} ) return response - except (cls.MultipleObjectsReturned, exceptions.UnmatchedPidProviderXMLError) as exc: + except ( + cls.MultipleObjectsReturned, + exceptions.UnmatchedPidProviderXMLError, + ) as exc: exc_type, exc_value, exc_traceback = sys.exc_info() response["records"] = [item.data for item in records] UnexpectedEvent.create( @@ -1281,7 +1289,7 @@ def is_registered( response.update({"error_msg": str(e), "error_type": str(type(e))}) return response return {} - + @classmethod def get_by_pid_v3(cls, pid_v3, partial_pid_v2=None, pid_v2=None): params = {} @@ -1293,7 +1301,7 @@ def get_by_pid_v3(cls, pid_v3, partial_pid_v2=None, pid_v2=None): params["v2__contains"] = partial_pid_v2 try: return cls.objects.get(**params) - except cls.MultipleObjectsReturned as e: + except cls.MultipleObjectsReturned: return cls.objects.filter(**params).order_by("-updated").first() @classmethod @@ -1336,10 +1344,8 @@ def mark_items_as_invalid(cls, issns): for item in cls.objects.filter( Q(issn_print__in=issns) | Q(issn_electronic__in=issns), ).iterator(): - try: - invalid = bool(item.xml_with_pre) - except Exception as e: - invalid = True + item.proc_status = choices.PPXML_STATUS_INVALID + item.save() @classmethod @profile_classmethod @@ -1451,11 +1457,13 @@ def fix_pkg_name(self, pkg_name): self.save() return True return False - + def add_event(self, name, proc_status, detail=None, errors=None, exceptions=None): self.proc_status = proc_status self.save() - return XMLEvent.register(self, name, detail=detail, errors=errors, exceptions=exceptions) + return XMLEvent.register( + self, name, detail=detail, errors=errors, exceptions=exceptions + ) class FixPidV2(CommonControlField): @@ -1602,26 +1610,26 @@ def get_or_create( def xml_url_zipfile_path(instance, filename): """ Generate the upload path for XMLURL zipfile. - + Args: instance: XMLURL instance filename: Name of the file - + Returns: Path string for file upload """ # Use URL hash to create a unique subdirectory - url_hash = abs(hash(instance.url)) % (10 ** 8) + url_hash = abs(hash(instance.url)) % (10**8) return f"pid_provider/xmlurl/{url_hash}/{filename}" class XMLURL(CommonControlField): """ Model to store URLs that experienced failures and should be retried in the future. - + This model tracks URLs that failed during processing, along with their status and associated article PID, enabling retry mechanisms to reprocess them later. - + Fields: url: URLField - The URL that needs to be retried status: CharField - To control the request status (e.g., "pending", "failed", "retrying") @@ -1630,17 +1638,15 @@ class XMLURL(CommonControlField): exceptions: CharField - Exception traceback information (truncated to 255 chars if needed) """ - url = models.URLField( - _("URL"), max_length=500, null=False, blank=False - ) - status = models.CharField( - _("Status"), max_length=50, null=True, blank=True - ) - pid = models.CharField( - _("Article PID"), max_length=23, null=True, blank=True - ) + url = models.URLField(_("URL"), max_length=500, null=False, blank=False) + status = models.CharField(_("Status"), max_length=50, null=True, blank=True) + pid = models.CharField(_("Article PID"), max_length=23, null=True, blank=True) zipfile = models.FileField( - _("ZIP File"), upload_to=xml_url_zipfile_path, null=True, blank=True, max_length=300, + _("ZIP File"), + upload_to=xml_url_zipfile_path, + null=True, + blank=True, + max_length=300, ) exceptions = models.CharField( _("Exceptions"), max_length=255, null=True, blank=True @@ -1729,30 +1735,32 @@ def create_or_update( def save_file(self, xml_content, filename=None): """ Create a zip file from XML content and save it to the zipfile field. - + Args: xml_content: str or bytes - The XML content to compress filename: str - Optional filename for the XML inside the zip (defaults to 'content.xml') - + Returns: bool - True if file was saved successfully, False otherwise """ try: # Convert string to bytes if needed if isinstance(xml_content, str): - xml_content = xml_content.encode('utf-8') - + xml_content = xml_content.encode("utf-8") + # Create in-memory zip file zip_buffer = io.BytesIO() - with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file: + with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file: # Use provided filename or default - xml_filename = filename or 'content.xml' + xml_filename = filename or "content.xml" zip_file.writestr(xml_filename, xml_content) - + # Save the zip file to the model zip_filename = f"{self.pid or 'unknown'}_{self.pk or 'new'}.zip" - self.zipfile.save(zip_filename, ContentFile(zip_buffer.getvalue()), save=True) - + self.zipfile.save( + zip_filename, ContentFile(zip_buffer.getvalue()), save=True + ) + return True except Exception as e: logging.error(f"Error saving zip file for XMLURL {self.url}: {e}") @@ -1779,9 +1787,8 @@ class XMLEvent(BaseEvent, CommonControlField): create (classmethod): Creates and saves a new XMLEvent instance. finish: Marks the event as completed and optionally updates details, errors, or exceptions. """ - ppxml = ParentalKey( - PidProviderXML, on_delete=models.CASCADE, related_name="events" - ) + + ppxml = ParentalKey(PidProviderXML, on_delete=models.CASCADE, related_name="events") @classmethod def register(cls, ppxml, name, detail=None, errors=None, exceptions=None): @@ -1789,5 +1796,7 @@ def register(cls, ppxml, name, detail=None, errors=None, exceptions=None): obj.ppxml = ppxml obj.name = name completed = bool(not errors and not exceptions) - obj.finish(completed=completed, detail=detail, errors=errors, exceptions=exceptions) - return obj \ No newline at end of file + obj.finish( + completed=completed, detail=detail, errors=errors, exceptions=exceptions + ) + return obj diff --git a/pid_provider/tasks.py b/pid_provider/tasks.py index 0977eee1..8744b005 100644 --- a/pid_provider/tasks.py +++ b/pid_provider/tasks.py @@ -131,8 +131,6 @@ def task_fix_pid_provider_xmls_status( ) """ try: - user = _get_user(self.request, username=username, user_id=user_id) - # Validação: ao menos uma operação deve ser especificada operations = { "invalid": mark_as_invalid, diff --git a/pid_provider/wagtail_hooks.py b/pid_provider/wagtail_hooks.py index 22874258..046d2512 100644 --- a/pid_provider/wagtail_hooks.py +++ b/pid_provider/wagtail_hooks.py @@ -1,12 +1,16 @@ -from django.http import HttpResponseRedirect from django.utils.translation import gettext_lazy as _ -from wagtail import hooks from wagtail.snippets.models import register_snippet from wagtail.snippets.views.snippets import SnippetViewSetGroup from config.menu import get_menu_order from core.views import CommonControlFieldViewSet -from pid_provider.models import XMLVersion, FixPidV2, OtherPid, PidProviderConfig, PidProviderXML +from pid_provider.models import ( + FixPidV2, + OtherPid, + PidProviderConfig, + PidProviderXML, + XMLVersion, +) class PidProviderXMLViewSet(CommonControlFieldViewSet):