Exploring the Jordanian Twittersphere¶
Data Collection¶
- We used X's (formerly Twitter) academic research API to pull tweets from three geolocations in Jordan: Amman, Zarqa, and Irbid.
In [ ]:
import tweepy
import pandas as pd
BEARER_TOKEN = "your_bearer_token_here"
client = tweepy.Client(bearer_token=BEARER_TOKEN)
# point_radius expects "longitude latitude radius", space-separated (radius up to 25mi)
amman_geo = "35.9239 31.9522 30km"
irbid_geo = "35.8500 32.5556 30km"
zarqa_geo = "36.0880 32.0728 30km"
locations = {
'amman': amman_geo,
'irbid': irbid_geo,
'zarqa': zarqa_geo
}
for city, geo in locations.items():
query = f"point_radius:[{geo}]"
    tweets = client.search_all_tweets(
        query=query,
        start_time="2023-05-01T00:00:00Z",
        max_results=500,  # full-archive search returns at most 500 tweets per request
tweet_fields=['created_at', 'lang', 'author_id', 'geo', 'entities',
'public_metrics', 'referenced_tweets', 'in_reply_to_user_id'],
        expansions=['author_id', 'geo.place_id', 'in_reply_to_user_id',
                    'referenced_tweets.id', 'referenced_tweets.id.author_id'],
place_fields=['country', 'country_code', 'full_name', 'geo', 'name'],
user_fields=['location', 'username']
)
    tweets_data = []
    # Look up expanded objects (users, referenced tweets) by id
    users_by_id = {u.id: u for u in tweets.includes.get('users', [])}
    ref_tweets_by_id = {t.id: t for t in tweets.includes.get('tweets', [])}
    for tweet in tweets.data or []:
        user = users_by_id.get(tweet.author_id)
        reply_user = users_by_id.get(tweet.in_reply_to_user_id)
        # API v2 has no retweeted/quoted status fields; derive them from referenced_tweets
        retweeted_id, retweeted_screen_name, quoted_id, replied_to_id = None, None, None, None
        for ref in tweet.referenced_tweets or []:
            if ref.type == 'retweeted':
                retweeted_id = str(ref.id)
                original = ref_tweets_by_id.get(ref.id)
                original_author = users_by_id.get(original.author_id) if original else None
                retweeted_screen_name = original_author.username if original_author else None
            elif ref.type == 'quoted':
                quoted_id = str(ref.id)
            elif ref.type == 'replied_to':
                replied_to_id = str(ref.id)
        tweet_dict = {
            'screen_name': user.username if user else None,
            'user_location': user.location if user else None,
            'id_str': str(tweet.id),
            'created_at': tweet.created_at,
            'favorite_count': tweet.public_metrics['like_count'],
            'retweet_count': tweet.public_metrics['retweet_count'],
            'text': tweet.text,
            'geo': tweet.geo.get('coordinates') if tweet.geo else None,
            'coordinates': tweet.geo.get('coordinates') if tweet.geo else None,
            'entities': tweet.entities,
            'place': tweet.geo.get('place_id') if tweet.geo else None,
            'in_reply_to_screen_name': reply_user.username if reply_user else None,
            'in_reply_to_user_id': tweet.in_reply_to_user_id,
            'in_reply_to_status_id_str': replied_to_id,
            'is_quote_status': quoted_id is not None,
            'lang': tweet.lang,
            'quoted_status_id_str': quoted_id,
            'retweeted_status_id_str': retweeted_id,
            'retweeted_status_screen_name': retweeted_screen_name,
        }
        tweets_data.append(tweet_dict)
df = pd.DataFrame(tweets_data)
df.to_csv(f'tweets_{city}.csv', index=False)
In [ ]:
amman_df = pd.read_csv('tweets_amman.csv', dtype=str)
irbid_df = pd.read_csv('tweets_irbid.csv', dtype=str)
zarqa_df = pd.read_csv('tweets_zarqa.csv', dtype=str)
In [ ]:
# Add retweet columns to each dataframe (values are kept only for rows that are retweets)
for df in [amman_df, irbid_df, zarqa_df]:
    df['is_a_retweet'] = df['retweeted_status_id_str'].notna()
    df['retweet_created_at'] = df['created_at'].where(df['is_a_retweet'])
    df['retweet_lang'] = df['lang'].where(df['is_a_retweet'])
    df['retweet_place'] = df['place'].where(df['is_a_retweet'])
    df['retweet_geo'] = df['geo'].where(df['is_a_retweet'])
    df['retweet_coordinates'] = df['coordinates'].where(df['is_a_retweet'])
    df['retweet_text'] = df['text'].where(df['is_a_retweet'])
    df['retweet_count.1'] = df['retweet_count'].where(df['is_a_retweet'])
    df['in_reply_to_screen_name.1'] = df['in_reply_to_screen_name'].where(df['is_a_retweet'])
# Save updated dataframes
amman_df.to_csv('tweets_amman.csv', index=False)
irbid_df.to_csv('tweets_irbid.csv', index=False)
zarqa_df.to_csv('tweets_zarqa.csv', index=False)
Data Processing¶
- Tweet cleaning
In [ ]:
import re
class Cleaner:
    # Collapse characters repeated more than `limit` times (e.g. "coooool" -> "cool")
    repeated_characters_pattern = re.compile(r'(.)\1{1,}')
    # Arabic diacritics (tashkeel) and annotation marks
    arabic_diacritics_pattern = re.compile(r"[\u064B-\u0652\u0610-\u061A]")
    # Quotation marks, non-breaking space, tatweel, and directional control characters
    unwanted_chars_pattern = re.compile(r"«|»|\"|\'|\xa0|'|”|“|ـ|\u202c|\u202e")
    # Leading "RT" marker of retweets
    rt_pattern = re.compile(r"^(?=RT)(.{2})")
    # @mentions (optionally followed by a colon)
    mention_pattern = re.compile(r"@[a-zA-Z_0-9]+:?")
    # URLs
    link_pattern = re.compile(r"https?://\S+")
    # Immediately repeated words
    repeated_words_pattern = re.compile(r'(\w{2,})\1{1,}')

    @staticmethod
    def clean(text, limit=2):
        text = Cleaner.repeated_characters_pattern.sub(lambda match: match.group(1) * limit, text)
        text = Cleaner.unwanted_chars_pattern.sub('', text)
        text = Cleaner.arabic_diacritics_pattern.sub('', text)
        text = Cleaner.repeated_words_pattern.sub(lambda match: match.group(1), text)
        text = Cleaner.rt_pattern.sub('', text)
        text = Cleaner.mention_pattern.sub('', text)
        text = Cleaner.link_pattern.sub('', text)
        return text
In [ ]:
amman_df['text_clean'] = amman_df['text'].apply(Cleaner.clean)
irbid_df['text_clean'] = irbid_df['text'].apply(Cleaner.clean)
zarqa_df['text_clean'] = zarqa_df['text'].apply(Cleaner.clean)
amman_df.to_csv('tweets_amman.csv', index=False)
irbid_df.to_csv('tweets_irbid.csv', index=False)
zarqa_df.to_csv('tweets_zarqa.csv', index=False)
- Normalizing dates
In [ ]:
def convert_datetime(date):
    # created_at from the v2 API is ISO 8601; normalize it to 'YYYY-MM-DD HH:MM:SS'
    try:
        datetime_obj = pd.to_datetime(date)
        return datetime_obj.strftime('%Y-%m-%d %H:%M:%S')
    except (ValueError, TypeError):
        return None

amman_df['created_at_datetime'] = amman_df['created_at'].apply(convert_datetime)
irbid_df['created_at_datetime'] = irbid_df['created_at'].apply(convert_datetime)
zarqa_df['created_at_datetime'] = zarqa_df['created_at'].apply(convert_datetime)
- Running language detection (in addition to Twitter's own language field)
In [ ]:
from langdetect import detect, DetectorFactory

# Make langdetect deterministic across runs
DetectorFactory.seed = 0

def detect_language(text):
    try:
        return detect(text)
    except Exception:
        # Empty or non-linguistic text raises a detection error
        return None
# Run language detection on cleaned text
amman_df['lang_detected'] = amman_df['text_clean'].apply(detect_language)
irbid_df['lang_detected'] = irbid_df['text_clean'].apply(detect_language)
zarqa_df['lang_detected'] = zarqa_df['text_clean'].apply(detect_language)
- Keeping only retweets
In [ ]:
amman_df_rt_only = amman_df[amman_df['is_a_retweet']]
irbid_df_rt_only = irbid_df[irbid_df['is_a_retweet']]
zarqa_df_rt_only = zarqa_df[zarqa_df['is_a_retweet']]
- Downloading images from tweets
In [ ]:
import requests
import os

def download_image(url, save_path):
    try:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            with open(save_path, 'wb') as f:
                f.write(response.content)
            return True
    except requests.RequestException:
        return False
    return False

# 'media_url' is populated separately from the tweets' media entities/attachments
city_dfs = {'amman': amman_df, 'irbid': irbid_df, 'zarqa': zarqa_df}
for city, df in city_dfs.items():
    os.makedirs(f'images/{city}', exist_ok=True)
    for idx, row in df.iterrows():
        if pd.notna(row.get('media_url')):
            filename = f"images/{city}/{row['id_str']}.jpg"
            download_image(row['media_url'], filename)
- OCRing images
In [ ]:
import http.client, urllib.request, urllib.parse, urllib.error, base64
from dotenv import load_dotenv
import os
import json
import pandas as pd
# Image Analysis 4.0 (2023-02-01-preview); local images are sent as binary data
AZURE_ENDPOINT_HOST = 'your-resource-name.cognitiveservices.azure.com'

def ocr_azure(subscription_key, image_data, feature_type):
    headers = {
        # Binary image upload instead of a public image URL
        'Content-Type': 'application/octet-stream',
        'Ocp-Apim-Subscription-Key': subscription_key,
    }
    params = urllib.parse.urlencode({
        # Request parameters
        'features': feature_type,
        'gender-neutral-caption': 'False',
    })
    try:
        conn = http.client.HTTPSConnection(AZURE_ENDPOINT_HOST)
        conn.request("POST", "/computervision/imageanalysis:analyze?api-version=2023-02-01-preview&%s" % params,
                     image_data, headers)
        response = conn.getresponse()
        data = response.read()
        conn.close()
        return data
    except Exception as e:
        print(f"OCR request failed: {e}")

load_dotenv()
subscription_key = os.getenv('AZURE_SUBSCRIPTION_KEY')

# Process images from each city's directory ('read' is the OCR feature in Image Analysis 4.0)
def process_directory(directory, feature_type='read'):
    results = []
    for filename in os.listdir(directory):
        if filename.endswith('.jpg'):
            image_path = os.path.join(directory, filename)
            tweet_id = filename.replace('.jpg', '')
            with open(image_path, 'rb') as f:
                image_data = f.read()
            ocr_result = ocr_azure(subscription_key, image_data, feature_type)
            if ocr_result:
                results.append({
                    'tweet_id': tweet_id,
                    'ocr_text': json.loads(ocr_result.decode('utf-8'))
                })
    return pd.DataFrame(results)
zarqa_results = process_directory('images/zarqa')
amman_results = process_directory('images/amman')
irbid_results = process_directory('images/irbid')
zarqa_results.to_csv('zarqa_ocr_results.csv', index=False)
amman_results.to_csv('amman_ocr_results.csv', index=False)
irbid_results.to_csv('irbid_ocr_results.csv', index=False)
- Adding OCR text to the dataframes
In [ ]:
zarqa_ocr = pd.read_csv('zarqa_ocr_results.csv', dtype=str)
amman_ocr = pd.read_csv('amman_ocr_results.csv', dtype=str)
irbid_ocr = pd.read_csv('irbid_ocr_results.csv', dtype=str)
all_ocr = pd.concat([zarqa_ocr, amman_ocr, irbid_ocr])
ocr_dict = dict(zip(all_ocr['tweet_id'], all_ocr['ocr_text']))
# Map the OCR text back onto each city's dataframe by tweet id
for df in [amman_df, irbid_df, zarqa_df]:
    df['image_content'] = df['id_str'].map(ocr_dict).fillna('')
- Generating the user retweet network
In [ ]:
all_retweets = pd.concat([amman_df_rt_only, irbid_df_rt_only, zarqa_df_rt_only])
network_data = []
for _, tweet in all_retweets.iterrows():
    # Edge from the retweeting user to the author of the original (retweeted) tweet
    if pd.notna(tweet['screen_name']) and pd.notna(tweet['retweeted_status_screen_name']):
        network_data.append({
            'source': tweet['screen_name'],
            'target': tweet['retweeted_status_screen_name'],
            'lang': tweet['lang_detected']
        })
network_df = pd.DataFrame(network_data)
network_df = network_df.groupby(['source', 'target']).size().reset_index(name='weight')
network_df.to_csv('jordan_twittersphere_retweet_network.csv', index=False)
User Network Analysis using Cytoscape¶
In this section, we load the retweet network into Cytoscape for visualization and analysis:
- We color user nodes based on the language they tweet in (Arabic, English, or Other)
- We run community detection on the network using the Louvain algorithm (a programmatic sketch of this step follows below)
- We import the detected community labels per user back here and join them onto the tweets
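As a programmatic cross-check of the Cytoscape workflow, the Louvain step can also be run directly on the exported edge list. This is a minimal sketch assuming networkx ≥ 3.0; it writes its partition to a separate file (communities_louvain_networkx.csv, a hypothetical filename) so it does not overwrite the Cytoscape export read in the next cell.
In [ ]:
import pandas as pd
import networkx as nx

# Load the retweet edge list exported above
edges = pd.read_csv('jordan_twittersphere_retweet_network.csv')

# Build a weighted graph of retweet interactions
G = nx.from_pandas_edgelist(edges, source='source', target='target', edge_attr='weight')

# Louvain community detection (networkx >= 3.0 ships nx.community.louvain_communities)
partition = nx.community.louvain_communities(G, weight='weight', seed=42)

# Flatten to a user -> community table comparable to the Cytoscape export
rows = [{'username': user, 'community': i}
        for i, members in enumerate(partition)
        for user in members]
pd.DataFrame(rows).to_csv('communities_louvain_networkx.csv', index=False)
Louvain is stochastic, so the fixed seed keeps this run reproducible; the Cytoscape partition may still differ slightly.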
In [ ]:
communities_df = pd.read_csv('jordan_twittersphere_communities_louvain.csv')
- We generate summaries for the top 20 communities using GPT-4o mini
In [ ]:
from openai import OpenAI
import pandas as pd
client = OpenAI()
def get_openai_response(prompt):
response = client.chat.completions.create(
model="gpt-4o-mini",
messages=[{"role": "user", "content": prompt}],
)
return response.choices[0].message.content
# Get top 20 communities by size
communities = communities_df.value_counts("community")
top_20_communities = communities.sort_values(ascending=False).head(20)
# Create empty list to store summaries
community_summaries = []
# Prepare prompts and query the model for each community
for community in top_20_communities.index:
    # Get tweets from this community
    community_tweets = all_retweets[all_retweets["screen_name"].isin(
        communities_df[communities_df["community"] == community]["username"]
    )]
    # Format tweets for the prompt (first 50 tweets as a sample)
    tweets_sample = community_tweets["text"].tolist()[:50]
    tweets_block = "\n".join(tweets_sample)
    prompt = f"""Analyze these tweets from a Twitter community in Jordan.
Provide a brief summary of:
1. The main topics discussed
2. The apparent interests/focus of this community
3. Any notable patterns in communication style
Tweets sample:
{tweets_block}
"""
    # Call the OpenAI API and store the response
    try:
        response = get_openai_response(prompt)
community_summaries.append({
'community': community,
'size': communities[community],
'summary': response
})
except Exception as e:
print(f"Error processing community {community}: {str(e)}")
# Create DataFrame from summaries
summaries_df = pd.DataFrame(community_summaries)
summaries_df.to_csv('jordan_twittersphere_community_summaries.csv', index=False)
Cross-referencing communities with similarity in content¶
- We take the tweet content within the top 20 communities
- We use a multilingual embeddings model to generate an embedding for each tweet
- We use a similarity metric to cluster the tweets into content-based groups
- We compare these content clusters with the communities detected on the retweet network (a clustering-based comparison is sketched at the end of this section)
In [ ]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import euclidean
import torch
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
In [ ]:
# Multilingual model to handle English and Arabic tweets
model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_name, device=device)
In [ ]:
def get_embedding(text, model):
    with torch.no_grad():
        return model.encode([text], device=device)[0]

# Get tweets from the top 20 communities, joining community labels onto the tweets by username
top_20_communities = communities.sort_values(ascending=False).head(20).index
all_tweets = pd.concat([amman_df, irbid_df, zarqa_df])
all_tweets = all_tweets.merge(communities_df[['username', 'community']],
                              left_on='screen_name', right_on='username', how='left')
community_tweets = all_tweets[all_tweets['community'].isin(top_20_communities)]
# Create list to store tweet data with embeddings
tweet_embeddings = []
for idx, row in community_tweets.iterrows():
    try:
        # Get embedding for the cleaned text
        embedding = get_embedding(row['text_clean'], model)
        # Store tweet data with embedding
        tweet_embeddings.append({
            'tweet_id': row['id_str'],
            'text': row['text_clean'],
            'community': row['community'],
            'embedding': embedding
        })
        # Print progress every 100 tweets
        if len(tweet_embeddings) % 100 == 0:
            print(f"Processed {len(tweet_embeddings)} tweets")
    except Exception as e:
        print(f"Error processing tweet {row['id_str']}: {str(e)}")
        continue
# Convert to DataFrame
embeddings_df = pd.DataFrame(tweet_embeddings)

def calculate_avg_embedding(group):
    return np.mean(np.stack(group['embedding'].values), axis=0)

community_avg_embeddings = embeddings_df.groupby('community').apply(calculate_avg_embedding)

def calculate_similarity(row):
    community_embedding = community_avg_embeddings[row['community']]
    similarity = cosine_similarity([row['embedding']], [community_embedding])[0][0]
    return similarity

# Similarity of each tweet to its own community's average embedding
embeddings_df['similarity_to_community'] = embeddings_df.apply(calculate_similarity, axis=1)

# 2D projection of the tweet embeddings, colored by retweet-network community
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(np.stack(embeddings_df['embedding'].values))
plt.figure(figsize=(10, 8))
community_codes = pd.factorize(embeddings_df['community'])[0]
scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=community_codes, cmap='viridis')
plt.colorbar(scatter)
plt.title('Tweet Embeddings Visualization')
plt.show()
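The comparison described in the bullets above can be sketched as follows: cluster the tweet embeddings with KMeans and score the agreement with the retweet-network communities using the adjusted Rand index. This is a minimal illustration that assumes the embeddings_df built above; setting k to the number of network communities is an assumption made for the comparison, not a tuned choice.
In [ ]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import numpy as np

# Stack embeddings into an (n_tweets, dim) matrix
X = np.stack(embeddings_df['embedding'].values)

# Cluster tweets by content similarity; k matches the number of network communities
k = embeddings_df['community'].nunique()
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
content_clusters = kmeans.fit_predict(X)

# Agreement between content clusters and retweet-network communities
network_labels = pd.factorize(embeddings_df['community'])[0]
ari = adjusted_rand_score(network_labels, content_clusters)
print(f"Adjusted Rand index between content clusters and network communities: {ari:.3f}")
An adjusted Rand index close to 1 means the content clusters and the network communities group tweets almost identically, while a value near 0 means the two partitions are essentially unrelated.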